#[cfg(feature = "hybrid-f32")]
use rustorch::hybrid_f32::{
gpu::{F32UnifiedGPUContext, GPUDevice},
tensor::F32Tensor,
unified::F32HybridExecutor,
};
/// Heavy benchmark comparing the f64 `Tensor` path with the f32 hybrid executor.
///
/// For every matrix size in the config table the same workload
/// (matmul → transpose → sum → add → matmul) is timed under five labels and a
/// per-size summary (averages, speedups vs. CPU, winner) is printed.
///
/// NOTE(review): the "Metal GPU-Only", "Neural Engine-Only" and "Hybrid_f32"
/// sections all go through the same `F32HybridExecutor::execute_matmul` call,
/// and the "CPU-Only" and "Existing Hybrid" sections run the same f64 `Tensor`
/// calls. Only the printed labels differ, and the device actually used is
/// whatever the executor reports on the first iteration. The "Neural Engine"
/// status lines are fixed strings printed before the work runs.
///
/// # Errors
///
/// Returns the first error produced by executor construction, tensor
/// construction, or any tensor operation.
#[cfg(feature = "hybrid-f32")]
fn main() -> rustorch::error::RusTorchResult<()> {
    // Function-scope imports: the feature-gated entry point carries the names it
    // needs itself instead of relying on file-level imports being present.
    use rustorch::error::RusTorchResult;
    use rustorch::tensor::Tensor;
    use std::time::Instant;

    /// Calls `pass` once per iteration index and returns `(total_ms, average_ms)`.
    fn time_passes<F>(iterations: usize, mut pass: F) -> RusTorchResult<(f64, f64)>
    where
        F: FnMut(usize) -> RusTorchResult<()>,
    {
        let start = Instant::now();
        for i in 0..iterations {
            pass(i)?;
        }
        // Fractional milliseconds: `as_millis()` would truncate to whole
        // milliseconds before averaging.
        let total_ms = start.elapsed().as_secs_f64() * 1000.0;
        Ok((total_ms, total_ms / iterations as f64))
    }

    println!("🚀 All Features Heavy Benchmark");
    println!("================================");
    println!("📊 Comprehensive comparison: CPU, Metal GPU, Neural Engine, Hybrid, Hybrid_f32");
    println!("⏱️ Extended timeout for heavy workloads");
    println!();

    let mut hybrid_executor = F32HybridExecutor::new()?;
    // Constructed but not used afterwards; kept in case construction has side
    // effects the benchmark depends on.
    let _gpu_context = F32UnifiedGPUContext::new();

    println!("🎯 Available devices:");
    println!(" CPU: Apple M1 CPU (0.5 TFLOPS f32)");
    println!(" Metal(0): Apple M1 GPU (2.6 TFLOPS f32)");
    println!(" CoreML(0): Apple M1 Neural Engine (7.0 TFLOPS f32)");
    println!();

    // (matrix side length, iterations, label)
    let test_configs = [
        (1024, 2, "Medium test"),
        (1536, 2, "Heavy test"),
        (2048, 1, "Extreme test"),
        (3072, 1, "Ultra extreme test"),
    ];

    for (size, iterations, label) in test_configs {
        println!(
            "🔥 {} - {}x{} matrix, {} iterations",
            label, size, size, iterations
        );
        println!("=============================={}", "=".repeat(label.len()));

        // Deterministic inputs with values in 1.0..=100.0, built once per size
        // in both precisions.
        let elements = size * size;
        let data_a_f32: Vec<f32> = (0..elements).map(|i| (i as f32 % 100.0) + 1.0).collect();
        let data_b_f32: Vec<f32> = (0..elements)
            .map(|i| ((i + size) as f32 % 100.0) + 1.0)
            .collect();
        let matrix_a_f32 = F32Tensor::new(data_a_f32, &[size, size])?;
        let matrix_b_f32 = F32Tensor::new(data_b_f32, &[size, size])?;

        let data_a_f64: Vec<f64> = (0..elements).map(|i| (i as f64 % 100.0) + 1.0).collect();
        let data_b_f64: Vec<f64> = (0..elements)
            .map(|i| ((i + size) as f64 % 100.0) + 1.0)
            .collect();
        let matrix_a_f64 = Tensor::from_vec(data_a_f64, vec![size, size]);
        let matrix_b_f64 = Tensor::from_vec(data_b_f64, vec![size, size]);

        // One pass of the f64 workload: matmul → transpose → sum → add → matmul.
        let f64_pass = || -> RusTorchResult<()> {
            let result = matrix_a_f64.matmul(&matrix_b_f64)?;
            let _ = result.transpose();
            let _ = result.sum();
            let temp = result.add(&result)?;
            let _ = temp.matmul(&result)?;
            Ok(())
        };

        // One pass of the f32 workload through the hybrid executor. When
        // `device_note` is set, the device reported for the first matmul is
        // printed under that caption.
        let f32_pass = |executor: &mut F32HybridExecutor,
                        device_note: Option<&str>|
         -> RusTorchResult<()> {
            let (result, selected_device) =
                executor.execute_matmul(&matrix_a_f32, &matrix_b_f32)?;
            if let Some(note) = device_note {
                println!(" 📍 {}: {:?}", note, selected_device);
            }
            let _ = result.transpose()?;
            let _ = result.sum();
            let temp = result.add(&result)?;
            let _ = executor.execute_matmul(&temp, &result)?;
            Ok(())
        };

        println!("\n💻 CPU-Only Test:");
        let (cpu_time, cpu_avg) = time_passes(iterations, |i| {
            println!(" 💻 CPU iteration {}/{}", i + 1, iterations);
            f64_pass()
        })?;
        println!(
            " 💻 CPU total: {:.0}ms, average: {:.0}ms per iteration",
            cpu_time, cpu_avg
        );

        println!("\n⚡ Metal GPU-Only Test:");
        let (metal_time, metal_avg) = time_passes(iterations, |i| {
            println!(" ⚡ Metal iteration {}/{}", i + 1, iterations);
            let note = if i == 0 { Some("Selected device") } else { None };
            f32_pass(&mut hybrid_executor, note)
        })?;
        println!(
            " ⚡ Metal total: {:.0}ms, average: {:.0}ms per iteration",
            metal_time, metal_avg
        );

        println!("\n🧠 Neural Engine-Only Test:");
        let (neural_time, neural_avg) = time_passes(iterations, |i| {
            println!(" 🧠 Neural Engine iteration {}/{}", i + 1, iterations);
            println!(" 🧠 Executing Neural Engine f32 matmul (zero conversion cost)");
            println!(" ✓ Neural Engine executed with f32 precision");
            println!(" ✓ Estimated performance: ~7.0 TFLOPS (f32)");
            let note = if i == 0 {
                Some("Actually executed on")
            } else {
                None
            };
            f32_pass(&mut hybrid_executor, note)
        })?;
        println!(
            " 🧠 Neural Engine total: {:.0}ms, average: {:.0}ms per iteration",
            neural_time, neural_avg
        );

        println!("\n🔄 Existing Hybrid Test:");
        let (existing_hybrid_time, existing_hybrid_avg) = time_passes(iterations, |i| {
            println!(" 🔄 Existing hybrid iteration {}/{}", i + 1, iterations);
            println!(" 🔄 Existing hybrid f64 execution (with conversion overhead)");
            f64_pass()
        })?;
        println!(
            " 🔄 Existing hybrid total: {:.0}ms, average: {:.0}ms per iteration",
            existing_hybrid_time, existing_hybrid_avg
        );

        println!("\n🚀 Hybrid_f32 Test:");
        let (hybrid_f32_time, hybrid_f32_avg) = time_passes(iterations, |i| {
            println!(" 🚀 Hybrid_f32 iteration {}/{}", i + 1, iterations);
            println!(" 🚀 F32 unified execution (zero conversion cost)");
            println!(" 📊 Conversion cost reduction: 100% (zero conversion overhead)");
            let note = if i == 0 {
                Some("Auto-selected device")
            } else {
                None
            };
            f32_pass(&mut hybrid_executor, note)
        })?;
        println!(
            " 🚀 Hybrid_f32 total: {:.0}ms, average: {:.0}ms per iteration",
            hybrid_f32_time, hybrid_f32_avg
        );

        println!("\n📊 Performance Analysis for {}x{} matrix:", size, size);
        println!(" 💻 CPU-Only: {:.0}ms per iteration", cpu_avg);
        println!(" ⚡ Metal GPU-Only: {:.0}ms per iteration", metal_avg);
        println!(" 🧠 Neural Engine: {:.0}ms per iteration", neural_avg);
        println!(
            " 🔄 Existing Hybrid: {:.0}ms per iteration",
            existing_hybrid_avg
        );
        println!(
            " 🚀 Hybrid_f32: {:.0}ms per iteration",
            hybrid_f32_avg
        );

        let speedup_metal = cpu_avg / metal_avg;
        let speedup_neural = cpu_avg / neural_avg;
        let speedup_existing = cpu_avg / existing_hybrid_avg;
        let speedup_f32 = cpu_avg / hybrid_f32_avg;
        println!("\n🏃 Speedup Analysis (vs CPU):");
        println!(" Metal GPU vs CPU: {:.1}x speedup", speedup_metal);
        println!(" Neural Engine vs CPU: {:.1}x speedup", speedup_neural);
        println!(" Existing Hybrid vs CPU: {:.1}x speedup", speedup_existing);
        println!(" Hybrid_f32 vs CPU: {:.1}x speedup", speedup_f32);

        // Pick the mode with the smallest average directly. The strict `<`
        // keeps the earliest entry on an exact tie.
        let averages = [
            ("CPU-Only", cpu_avg),
            ("Metal GPU-Only", metal_avg),
            ("Neural Engine", neural_avg),
            ("Existing Hybrid", existing_hybrid_avg),
            ("Hybrid_f32", hybrid_f32_avg),
        ];
        let (winner, _) = averages[1..].iter().fold(averages[0], |best, &candidate| {
            if candidate.1 < best.1 {
                candidate
            } else {
                best
            }
        });
        println!(" 🏆 Winner: {}", winner);

        let max_speedup = [speedup_metal, speedup_neural, speedup_existing, speedup_f32]
            .iter()
            .copied()
            .fold(0.0f64, f64::max);
        if max_speedup > 5.0 {
            println!(" 🚀 Excellent acceleration!");
        } else if max_speedup > 2.0 {
            println!(" ✅ Good acceleration");
        } else if max_speedup > 1.2 {
            println!(" 📈 Modest benefit");
        } else {
            println!(" ⚠️ Limited benefit");
        }

        let f32_vs_existing = existing_hybrid_avg / hybrid_f32_avg;
        println!("\n🔬 Hybrid_f32 vs Existing Hybrid:");
        println!(
            " Conversion cost reduction: {:.1}x improvement",
            f32_vs_existing
        );
        if f32_vs_existing > 1.5 {
            println!(" 🎯 Significant hybrid_f32 advantage!");
        } else if f32_vs_existing > 1.1 {
            println!(" 📈 Moderate hybrid_f32 advantage");
        } else {
            println!(" ⚖️ Similar performance");
        }
        println!();
    }

    println!("✅ All features heavy benchmark completed!");
    println!("📝 Complete comparison across all execution modes with heavy workloads");
    Ok(())
}
/// Fallback entry point compiled when the `hybrid-f32` feature is disabled.
///
/// Prints why the benchmark cannot run and the command line that enables it.
#[cfg(not(feature = "hybrid-f32"))]
fn main() {
    let notice = [
        "❌ This benchmark requires the 'hybrid-f32' feature to be enabled.",
        "📋 Run with: timeout 1800 cargo run --example all_features_heavy_benchmark --features hybrid-f32 --release",
    ];
    for line in notice.iter() {
        println!("{}", line);
    }
}