#[cfg(feature = "hybrid-f32")]
use rustorch::hybrid_f32::{
gpu::F32UnifiedGPUContext, tensor::F32Tensor, unified::F32HybridExecutor,
};
#[cfg(feature = "hybrid-f32")]
use rustorch::gpu::{hybrid_executor::HybridExecutor, DeviceType, OpType};
/// Pure (rustorch-independent) helpers that turn raw timings into report text.
///
/// Also compiled under `test` so the verdict logic can be unit-tested without
/// enabling the `hybrid-f32` feature.
#[cfg(any(feature = "hybrid-f32", test))]
// Without the feature nothing but tests calls these helpers.
#[cfg_attr(not(feature = "hybrid-f32"), allow(dead_code))]
mod report {
    use std::cmp::Ordering;

    /// Mean of the recorded per-iteration timings in milliseconds.
    ///
    /// Returns `0.0` for an empty slice rather than the `NaN` of `0.0 / 0.0`.
    pub fn mean_ms(samples: &[f64]) -> f64 {
        if samples.is_empty() {
            0.0
        } else {
            samples.iter().sum::<f64>() / samples.len() as f64
        }
    }

    /// Size in (decimal) megabytes of a square `side`x`side` matrix whose
    /// elements occupy `bytes_per_element` bytes each.
    pub fn matrix_megabytes(side: usize, bytes_per_element: usize) -> f64 {
        (side * side * bytes_per_element) as f64 / 1_000_000.0
    }

    /// Label of the entry with the smallest time, or `None` for an empty slice.
    ///
    /// Exact ties resolve to the earliest entry. Incomparable values (`NaN`)
    /// are treated as equal so the selection never panics.
    pub fn fastest_mode<'a>(results: &[(&'a str, f64)]) -> Option<&'a str> {
        results
            .iter()
            .min_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal))
            .map(|entry| entry.0)
    }

    /// Qualitative summary line for the best speedup observed versus the CPU.
    pub fn acceleration_verdict(max_speedup: f64) -> &'static str {
        if max_speedup > 5.0 {
            " 🚀 Exceptional acceleration achieved!"
        } else if max_speedup > 2.0 {
            " 🚀 Excellent acceleration achieved!"
        } else if max_speedup > 1.5 {
            " ✅ Good acceleration achieved"
        } else if max_speedup > 1.2 {
            " 📈 Modest acceleration achieved"
        } else {
            " ⚠️ Limited acceleration observed"
        }
    }

    /// Summary line comparing Hybrid_f32 with the existing hybrid executor.
    ///
    /// `existing_over_hybrid` is `existing_avg / f32_avg`; values above 1 mean
    /// Hybrid_f32 finished the chain faster.
    pub fn hybrid_vs_existing_verdict(existing_over_hybrid: f64) -> String {
        if existing_over_hybrid > 1.5 {
            format!(
                " 🎯 Major advantage over true existing hybrid ({:.2}x faster)",
                existing_over_hybrid
            )
        } else if existing_over_hybrid > 1.2 {
            format!(
                " 📈 Significant advantage over true existing hybrid ({:.2}x faster)",
                existing_over_hybrid
            )
        } else if existing_over_hybrid > 1.05 {
            format!(
                " 📈 Moderate advantage over true existing hybrid ({:.2}x faster)",
                existing_over_hybrid
            )
        } else {
            " ⚖️ Similar performance to true existing hybrid".to_string()
        }
    }

    /// Summary line comparing Hybrid_f32 with the Metal GPU run.
    ///
    /// `hybrid_over_metal` is `f32_avg / metal_avg`: above 1 means Hybrid_f32
    /// took *longer* (Metal is faster), below 1 means Hybrid_f32 was faster.
    /// Ratios within ±5% of 1 are reported as a match.
    pub fn hybrid_vs_metal_verdict(hybrid_over_metal: f64) -> String {
        if hybrid_over_metal > 1.05 {
            format!(
                " 📊 Metal GPU slightly faster than Hybrid_f32 ({:.2}x)",
                hybrid_over_metal
            )
        } else if hybrid_over_metal < 0.95 {
            format!(
                " 🚀 Hybrid_f32 outperforms Metal GPU ({:.2}x faster)",
                1.0 / hybrid_over_metal
            )
        } else {
            " ✅ Hybrid_f32 matches Metal GPU performance".to_string()
        }
    }
}

/// Rejects the CPU device so the "existing hybrid" chain only counts when the
/// executor actually picked an accelerator.
///
/// # Errors
/// Returns a tensor-op error when `device` is `DeviceType::Cpu`.
#[cfg(feature = "hybrid-f32")]
fn ensure_gpu_device(device: &DeviceType) -> rustorch::error::RusTorchResult<()> {
    if *device == DeviceType::Cpu {
        return Err(rustorch::error::RusTorchError::tensor_op(
            "CPU fallback prohibited - GPU execution required",
        ));
    }
    Ok(())
}

/// Focused benchmark: runs the chain `matmul → transpose → add → matmul → sum`
/// on a square matrix in four configurations (f64 `Tensor` on CPU, f32 via
/// `F32HybridExecutor` labelled "Metal GPU", f64 through the `HybridExecution`
/// trait with CPU fallback rejected, and f32 via `F32HybridExecutor` labelled
/// "Hybrid_f32"), then prints timings, speedups and a verdict.
///
/// # Errors
/// Propagates any error from executor/tensor construction or from a tensor
/// operation, including the deliberate "CPU fallback prohibited" error.
#[cfg(feature = "hybrid-f32")]
fn main() -> rustorch::error::RusTorchResult<()> {
    // Function-local imports: harmless if the file already imports these at
    // module level, required if it does not.
    // NOTE(review): `rustorch::tensor::Tensor` is the assumed path — confirm.
    use rustorch::tensor::Tensor;
    use std::time::Instant;

    println!("🚀 Focused Comparison Benchmark");
    println!("================================");
    println!("📊 Detailed comparison: CPU, Metal GPU, True Existing Hybrid, Hybrid_f32");
    println!("⏱️ Optimized for comprehensive results within reasonable time");
    println!();

    let mut hybrid_executor = F32HybridExecutor::new()?;
    // Constructed but otherwise unused; kept in case construction has side
    // effects the benchmark relies on — TODO confirm.
    let _gpu_context = F32UnifiedGPUContext::new();
    let existing_hybrid_executor = HybridExecutor::new();

    println!("🎯 Target modes for focused comparison:");
    println!(" CPU: Apple M1 CPU (0.5 TFLOPS f32) - Baseline");
    println!(" Metal GPU: Apple M1 GPU (2.6 TFLOPS f32) - GPU acceleration");
    println!(" True Existing Hybrid: Metal(0) → CoreML(0) → CPU (improved chain)");
    println!(" Hybrid_f32: f32 unified with zero conversion cost");
    println!();

    let test_size: usize = 2048;
    let iterations: usize = 1;
    let element_count = test_size * test_size;
    // Reported for the f32 matrices; the f64 copies are twice this size.
    let matrix_mb = report::matrix_megabytes(test_size, std::mem::size_of::<f32>());

    println!(
        "🔥 Focused Test - {}x{} matrix, {} iterations",
        test_size, test_size, iterations
    );
    // Previously printed in GB with one decimal, which rounded to "0.0".
    println!("Memory usage: ~{:.1} MB per matrix", matrix_mb);
    println!("=====================================");
    println!("📊 Creating test matrices...");

    // The f32 and f64 inputs carry the same repeating 1..=100 pattern.
    let data_a_f32: Vec<f32> = (0..element_count)
        .map(|i| (i as f32 % 100.0) + 1.0)
        .collect();
    let data_b_f32: Vec<f32> = (0..element_count)
        .map(|i| ((i + test_size) as f32 % 100.0) + 1.0)
        .collect();
    let matrix_a_f32 = F32Tensor::new(data_a_f32, &[test_size, test_size])?;
    let matrix_b_f32 = F32Tensor::new(data_b_f32, &[test_size, test_size])?;

    let data_a_f64: Vec<f64> = (0..element_count)
        .map(|i| (i as f64 % 100.0) + 1.0)
        .collect();
    let data_b_f64: Vec<f64> = (0..element_count)
        .map(|i| ((i + test_size) as f64 % 100.0) + 1.0)
        .collect();
    let matrix_a_f64 = Tensor::from_vec(data_a_f64, vec![test_size, test_size]);
    let matrix_b_f64 = Tensor::from_vec(data_b_f64, vec![test_size, test_size]);

    // Times one pass of the chain through the f32 executor, in milliseconds.
    let perform_standard_operations_f32 = |a: &F32Tensor,
                                           b: &F32Tensor,
                                           executor: &mut F32HybridExecutor|
     -> rustorch::error::RusTorchResult<f64> {
        let start = Instant::now();
        let (result1, _) = executor.execute_matmul(a, b)?;
        let result2 = result1.transpose()?;
        let result3 = result2.add(&result1)?;
        let (result4, _) = executor.execute_matmul(&result3, &result1)?;
        let _ = result4.sum();
        // Fractional milliseconds; `as_millis()` would truncate to whole ms.
        Ok(start.elapsed().as_secs_f64() * 1000.0)
    };

    // Times one pass of the chain with plain f64 tensor methods, in milliseconds.
    let perform_standard_operations_f64 =
        |a: &Tensor<f64>, b: &Tensor<f64>| -> rustorch::error::RusTorchResult<f64> {
            let start = Instant::now();
            let result1 = a.matmul(b)?;
            let result2 = result1.transpose()?;
            let result3 = result2.add(&result1)?;
            let result4 = result3.matmul(&result1)?;
            let _ = result4.sum();
            Ok(start.elapsed().as_secs_f64() * 1000.0)
        };

    println!("\n💻 CPU-Only Test:");
    println!(" 🔥 Standard operation chain on CPU (f64)");
    let mut cpu_times = Vec::with_capacity(iterations);
    for i in 0..iterations {
        println!(" 💻 CPU iteration {}/{}", i + 1, iterations);
        let time = perform_standard_operations_f64(&matrix_a_f64, &matrix_b_f64)?;
        cpu_times.push(time);
        println!(" ⏱️ CPU operations: {:.0}ms", time);
    }
    let cpu_avg = report::mean_ms(&cpu_times);
    println!(" 💻 CPU average: {:.0}ms per operation chain", cpu_avg);

    // NOTE(review): this section and the "Hybrid_f32" section below invoke the
    // same closure with the same executor, so they time the same code path
    // (the later run additionally benefits from any warm-up).
    println!("\n⚡ Metal GPU-Only Test:");
    println!(" 🔥 Standard operation chain on Metal GPU (f32)");
    let mut metal_times = Vec::with_capacity(iterations);
    for i in 0..iterations {
        println!(" ⚡ Metal GPU iteration {}/{}", i + 1, iterations);
        let time =
            perform_standard_operations_f32(&matrix_a_f32, &matrix_b_f32, &mut hybrid_executor)?;
        metal_times.push(time);
        println!(" ⏱️ Metal GPU operations: {:.0}ms", time);
    }
    let metal_avg = report::mean_ms(&metal_times);
    println!(
        " ⚡ Metal GPU average: {:.0}ms per operation chain",
        metal_avg
    );

    // Times one pass of the chain through the `HybridExecution` trait. Each
    // step fails if the selected device is the CPU. The executor argument is
    // accepted but not used by the body.
    let perform_existing_hybrid_operations = |a: &Tensor<f64>,
                                              b: &Tensor<f64>,
                                              _executor: &HybridExecutor|
     -> rustorch::error::RusTorchResult<f64> {
        use rustorch::gpu::hybrid_executor::HybridExecution;

        let start = Instant::now();
        let result1 = a.hybrid_operation(OpType::LinearAlgebra, |device| {
            ensure_gpu_device(&device)?;
            println!(" 🎯 Executing matmul on device: {:?}", device);
            a.matmul(b)
        })?;
        let result2 = result1.hybrid_operation(OpType::LinearAlgebra, |device| {
            ensure_gpu_device(&device)?;
            println!(" 🎯 Executing transpose on device: {:?}", device);
            result1.transpose()
        })?;
        let result3 = result2.hybrid_operation(OpType::LinearAlgebra, |device| {
            ensure_gpu_device(&device)?;
            println!(" 🎯 Executing add on device: {:?}", device);
            result2.add(&result1)
        })?;
        let result4 = result3.hybrid_operation(OpType::LinearAlgebra, |device| {
            ensure_gpu_device(&device)?;
            println!(" 🎯 Executing final matmul on device: {:?}", device);
            result3.matmul(&result1)
        })?;
        let _ = result4.sum();
        Ok(start.elapsed().as_secs_f64() * 1000.0)
    };

    println!("\n🔄 True Existing Hybrid Test (NO CPU Fallback):");
    println!(" 🔥 HybridExecution trait with GPU enforcement");
    let mut existing_times = Vec::with_capacity(iterations);
    for i in 0..iterations {
        println!(
            " 🔄 True existing hybrid iteration {}/{}",
            i + 1,
            iterations
        );
        println!(" 🔄 Using HybridExecution trait with CPU fallback prohibition");
        let time = perform_existing_hybrid_operations(
            &matrix_a_f64,
            &matrix_b_f64,
            &existing_hybrid_executor,
        )?;
        existing_times.push(time);
        println!(" ⏱️ True existing hybrid operations: {:.0}ms", time);
    }
    let existing_avg = report::mean_ms(&existing_times);
    println!(
        " 🔄 True existing hybrid average: {:.0}ms per operation chain",
        existing_avg
    );

    println!("\n🚀 Hybrid_f32 Test:");
    println!(" 🔥 Standard operation chain with hybrid_f32 (f32)");
    let mut f32_times = Vec::with_capacity(iterations);
    for i in 0..iterations {
        println!(" 🚀 Hybrid_f32 iteration {}/{}", i + 1, iterations);
        println!(" 🚀 f32 unified execution with zero conversion cost");
        println!(" 📊 Automatic device selection for optimal performance");
        let time =
            perform_standard_operations_f32(&matrix_a_f32, &matrix_b_f32, &mut hybrid_executor)?;
        f32_times.push(time);
        println!(" ⏱️ Hybrid_f32 operations: {:.0}ms", time);
    }
    let f32_avg = report::mean_ms(&f32_times);
    println!(
        " 🚀 Hybrid_f32 average: {:.0}ms per operation chain",
        f32_avg
    );

    println!(
        "\n📊 Focused Comparison Analysis for {}x{} matrix:",
        test_size, test_size
    );
    println!(" Operation chain: matmul → transpose → add → matmul → sum");
    println!(" Memory per matrix: {:.1} MB", matrix_mb);
    println!();
    println!(" 💻 CPU-Only: {:.0}ms per chain", cpu_avg);
    println!(" ⚡ Metal GPU-Only: {:.0}ms per chain", metal_avg);
    println!(" 🔄 True Existing Hybrid: {:.0}ms per chain", existing_avg);
    println!(" 🚀 Hybrid_f32: {:.0}ms per chain", f32_avg);

    // Speedups are "CPU time / mode time": above 1 means faster than the CPU.
    let speedup_metal = cpu_avg / metal_avg;
    let speedup_existing = cpu_avg / existing_avg;
    let speedup_f32 = cpu_avg / f32_avg;
    println!("\n🏃 Focused Speedup Analysis (vs CPU):");
    println!(" Metal GPU vs CPU: {:.2}x speedup", speedup_metal);
    println!(
        " True Existing Hybrid vs CPU: {:.2}x speedup",
        speedup_existing
    );
    println!(" Hybrid_f32 vs CPU: {:.2}x speedup", speedup_f32);

    // "X vs Y" ratios are "Y time / X time": above 1 means X is faster.
    let metal_vs_existing = existing_avg / metal_avg;
    let metal_vs_f32 = f32_avg / metal_avg;
    let f32_vs_existing = existing_avg / f32_avg;
    println!("\n🔬 Relative Performance Analysis:");
    println!(
        " Metal GPU vs True Existing Hybrid: {:.2}x ratio",
        metal_vs_existing
    );
    println!(" Metal GPU vs Hybrid_f32: {:.2}x ratio", metal_vs_f32);
    println!(
        " Hybrid_f32 vs True Existing Hybrid: {:.2}x ratio",
        f32_vs_existing
    );

    // Pick the true minimum. The earlier 1%-tolerance chain could crown a
    // slower mode and defaulted to Hybrid_f32 when the best time was zero.
    let ranking = [
        ("CPU-Only", cpu_avg),
        ("Metal GPU-Only", metal_avg),
        ("True Existing Hybrid", existing_avg),
        ("Hybrid_f32", f32_avg),
    ];
    if let Some(winner) = report::fastest_mode(&ranking) {
        println!(" 🏆 Focused Winner: {}", winner);
    }

    let max_speedup = [speedup_metal, speedup_existing, speedup_f32]
        .iter()
        .copied()
        .fold(0.0f64, f64::max);
    println!("{}", report::acceleration_verdict(max_speedup));

    println!("\n🎯 Hybrid_f32 Advantages:");
    println!("{}", report::hybrid_vs_existing_verdict(f32_vs_existing));
    // `metal_vs_f32 > 1` means Metal was faster; the verdict used to be inverted.
    println!("{}", report::hybrid_vs_metal_verdict(metal_vs_f32));

    println!("\n✅ Focused comparison benchmark completed!");
    println!("📝 Clear performance hierarchy established across all key execution modes");
    Ok(())
}
/// Fallback entry point used when the `hybrid-f32` feature is disabled:
/// prints why the benchmark cannot run and the command that enables it.
#[cfg(not(feature = "hybrid-f32"))]
fn main() {
    // Each entry is written to stdout on its own line, in order.
    const NOTICE: [&str; 2] = [
        "❌ This benchmark requires the 'hybrid-f32' feature to be enabled.",
        "📋 Run with: timeout 1800 cargo run --example focused_comparison_benchmark --features hybrid-f32 --release",
    ];
    for line in NOTICE.iter() {
        println!("{}", line);
    }
}