use crate::{Device, Result, Tensor};
use std::collections::HashMap;
use std::time::Instant;
#[derive(Debug, Clone)]
pub struct SimpleBenchmarkConfig {
pub warmup_iterations: usize,
pub benchmark_iterations: usize,
pub test_sizes: Vec<Vec<usize>>,
}
impl Default for SimpleBenchmarkConfig {
fn default() -> Self {
Self {
warmup_iterations: 5,
benchmark_iterations: 20,
test_sizes: vec![
vec![256, 256], vec![512, 512], vec![1024, 1024], ],
}
}
}
#[derive(Debug, Clone)]
pub struct SimpleBenchmarkResult {
pub benchmark_name: String,
pub execution_time_ms: f64,
pub throughput_ops_per_sec: f64,
pub memory_usage_mb: f64,
}
pub struct SimpleBenchmarkSuite {
config: SimpleBenchmarkConfig,
results: HashMap<String, SimpleBenchmarkResult>,
}
impl SimpleBenchmarkSuite {
pub fn new(config: SimpleBenchmarkConfig) -> Self {
Self {
config,
results: HashMap::new(),
}
}
pub fn run_benchmarks(&mut self) -> Result<BenchmarkReport> {
println!("🚀 Running TenfloweRS Performance Benchmarks...");
self.benchmark_tensor_creation()?;
self.benchmark_element_wise_operations()?;
self.benchmark_matrix_operations()?;
Ok(BenchmarkReport {
results: self.results.clone(),
summary: self.generate_summary(),
})
}
fn benchmark_tensor_creation(&mut self) -> Result<()> {
println!(" 📊 Benchmarking tensor creation...");
for size in &self.config.test_sizes {
if size.len() == 2 {
let rows = size[0];
let cols = size[1];
for _ in 0..self.config.warmup_iterations {
let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
}
let start = Instant::now();
for _ in 0..self.config.benchmark_iterations {
let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
}
let elapsed = start.elapsed();
let avg_time_ms =
elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
let ops_per_sec = 1000.0 / avg_time_ms;
let memory_mb = (rows * cols * 4) as f64 / (1024.0 * 1024.0);
let result = SimpleBenchmarkResult {
benchmark_name: format!("TensorCreation_{}x{}", rows, cols),
execution_time_ms: avg_time_ms,
throughput_ops_per_sec: ops_per_sec,
memory_usage_mb: memory_mb,
};
self.results.insert(result.benchmark_name.clone(), result);
}
}
Ok(())
}
fn benchmark_element_wise_operations(&mut self) -> Result<()> {
println!(" ⚡ Benchmarking element-wise operations...");
for size in &self.config.test_sizes {
if size.len() == 2 {
let rows = size[0];
let cols = size[1];
let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
let b: Tensor<f32> = Tensor::ones(&[rows, cols]);
for _ in 0..self.config.warmup_iterations {
let _ = a.add(&b)?;
}
let start = Instant::now();
for _ in 0..self.config.benchmark_iterations {
let _ = a.add(&b)?;
}
let elapsed = start.elapsed();
let avg_time_ms =
elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
let elements_per_sec = (rows * cols) as f64 / (avg_time_ms / 1000.0);
let memory_mb = (rows * cols * 3 * 4) as f64 / (1024.0 * 1024.0);
let result = SimpleBenchmarkResult {
benchmark_name: format!("ElementwiseAdd_{}x{}", rows, cols),
execution_time_ms: avg_time_ms,
throughput_ops_per_sec: elements_per_sec,
memory_usage_mb: memory_mb,
};
self.results.insert(result.benchmark_name.clone(), result);
}
}
Ok(())
}
fn benchmark_matrix_operations(&mut self) -> Result<()> {
println!(" 🔢 Benchmarking matrix operations...");
for size in &self.config.test_sizes {
if size.len() == 2 {
let rows = size[0];
let cols = size[1];
let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
let b: Tensor<f32> = Tensor::ones(&[cols, rows]);
for _ in 0..self.config.warmup_iterations {
let _ = a.matmul(&b)?;
}
let start = Instant::now();
for _ in 0..self.config.benchmark_iterations {
let _ = a.matmul(&b)?;
}
let elapsed = start.elapsed();
let avg_time_ms =
elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
let flops = (rows * cols * cols * 2) as f64; let gflops_per_sec = flops / (avg_time_ms / 1000.0) / 1e9;
let memory_mb = (rows * cols * 2 + rows * rows) as f64 * 4.0 / (1024.0 * 1024.0);
let result = SimpleBenchmarkResult {
benchmark_name: format!("MatMul_{}x{}", rows, cols),
execution_time_ms: avg_time_ms,
throughput_ops_per_sec: gflops_per_sec,
memory_usage_mb: memory_mb,
};
self.results.insert(result.benchmark_name.clone(), result);
}
}
Ok(())
}
fn generate_summary(&self) -> BenchmarkSummary {
let mut total_time = 0.0f64;
let mut max_throughput = 0.0f64;
let mut total_memory = 0.0f64;
for result in self.results.values() {
total_time += result.execution_time_ms;
max_throughput = max_throughput.max(result.throughput_ops_per_sec);
total_memory += result.memory_usage_mb;
}
BenchmarkSummary {
total_benchmarks: self.results.len(),
average_execution_time_ms: total_time / self.results.len() as f64,
peak_throughput: max_throughput,
total_memory_usage_mb: total_memory,
performance_score: Self::calculate_performance_score(&self.results),
}
}
fn calculate_performance_score(results: &HashMap<String, SimpleBenchmarkResult>) -> f64 {
let mut score = 0.0;
let mut count = 0;
for result in results.values() {
let normalized_score = (result.throughput_ops_per_sec / 1e6).min(10.0); score += normalized_score;
count += 1;
}
if count > 0 {
score / count as f64
} else {
0.0
}
}
}
#[derive(Debug, Clone)]
pub struct BenchmarkReport {
pub results: HashMap<String, SimpleBenchmarkResult>,
pub summary: BenchmarkSummary,
}
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
pub total_benchmarks: usize,
pub average_execution_time_ms: f64,
pub peak_throughput: f64,
pub total_memory_usage_mb: f64,
pub performance_score: f64,
}
impl BenchmarkReport {
pub fn print_report(&self) {
println!("\n🎯 === TENFLOWERS PERFORMANCE BENCHMARK REPORT ===");
println!("\n📊 Individual Benchmark Results:");
for (name, result) in &self.results {
println!(
" {} - {:.2}ms, {:.1} ops/s, {:.1} MB",
name,
result.execution_time_ms,
result.throughput_ops_per_sec,
result.memory_usage_mb
);
}
println!("\n📈 Summary:");
println!(" Total Benchmarks: {}", self.summary.total_benchmarks);
println!(
" Average Execution Time: {:.2}ms",
self.summary.average_execution_time_ms
);
println!(
" Peak Throughput: {:.1} ops/s",
self.summary.peak_throughput
);
println!(
" Total Memory Usage: {:.1} MB",
self.summary.total_memory_usage_mb
);
println!(
" Performance Score: {:.2}/10",
self.summary.performance_score
);
match self.summary.performance_score {
score if score >= 8.0 => println!(" ✅ Excellent Performance!"),
score if score >= 6.0 => println!(" ✅ Good Performance"),
score if score >= 4.0 => println!(" ⚠️ Moderate Performance"),
_ => println!(" ❌ Performance Needs Improvement"),
}
}
}
pub fn run_simple_benchmarks() -> Result<BenchmarkReport> {
let config = SimpleBenchmarkConfig::default();
let mut suite = SimpleBenchmarkSuite::new(config);
suite.run_benchmarks()
}
pub fn validate_optimizations() -> Result<()> {
println!("🔍 === OPTIMIZATION VALIDATION ===");
println!("⚡ Testing CPU Performance...");
let _cpu_device = Device::Cpu;
let a: Tensor<f32> = Tensor::ones(&[1000, 1000]);
let b: Tensor<f32> = Tensor::ones(&[1000, 1000]);
let start = Instant::now();
let _result = a.matmul(&b)?;
let cpu_time = start.elapsed();
println!(
" CPU MatMul (1000x1000): {:.2}ms",
cpu_time.as_secs_f64() * 1000.0
);
println!("📊 Testing SIMD Effectiveness...");
let large_tensor: Tensor<f32> = Tensor::ones(&[10000]);
let another_tensor: Tensor<f32> = Tensor::ones(&[10000]);
let start = Instant::now();
let _result = large_tensor.add(&another_tensor)?;
let simd_time = start.elapsed();
println!(
" Element-wise Add (10k elements): {:.2}ms",
simd_time.as_secs_f64() * 1000.0
);
println!("💾 Testing Memory Efficiency...");
let memory_test_size = 5000;
let start = Instant::now();
let _large_matrix: Tensor<f32> = Tensor::zeros(&[memory_test_size, memory_test_size]);
let memory_time = start.elapsed();
let memory_mb = (memory_test_size * memory_test_size * 4) as f64 / (1024.0 * 1024.0);
let allocation_rate = memory_mb / memory_time.as_secs_f64();
println!(
" Memory Allocation ({}MB): {:.2}ms, {:.1} MB/s",
memory_mb,
memory_time.as_secs_f64() * 1000.0,
allocation_rate
);
println!("✅ Optimization validation complete!");
Ok(())
}