#[cfg(feature = "hybrid-f32")]
use rustorch::hybrid_f32::{
gpu::{F32UnifiedGPUContext, GPUDevice},
tensor::F32Tensor,
unified::F32HybridExecutor,
};
#[cfg(feature = "hybrid-f32")]
struct DeviceSpecificBenchmark {
hybrid_executor: F32HybridExecutor,
gpu_context: F32UnifiedGPUContext,
}
#[cfg(feature = "hybrid-f32")]
impl DeviceSpecificBenchmark {
fn new() -> rustorch::error::RusTorchResult<Self> {
let hybrid_executor = F32HybridExecutor::new()?;
let gpu_context = F32UnifiedGPUContext::new();
println!("🚀 Device-Specific Benchmark initialized");
println!("🔍 Detecting available devices...");
Ok(Self {
hybrid_executor,
gpu_context,
})
}
fn execute_cpu_only(
&self,
a: &F32Tensor,
b: &F32Tensor,
) -> rustorch::error::RusTorchResult<F32Tensor> {
println!("💻 Executing on CPU (forced)");
a.matmul(b)
}
fn execute_metal_gpu(
&mut self,
a: &F32Tensor,
b: &F32Tensor,
) -> rustorch::error::RusTorchResult<F32Tensor> {
println!("⚡ Executing on Metal GPU (forced f32 direct)");
let (result, selected_device) = self.hybrid_executor.execute_matmul(a, b)?;
println!(" 📍 Actually executed on: {:?}", selected_device);
Ok(result)
}
fn execute_neural_engine(
&mut self,
a: &F32Tensor,
b: &F32Tensor,
) -> rustorch::error::RusTorchResult<F32Tensor> {
println!("🧠 Executing on Neural Engine (f32 precision)");
println!(" 🧠 Neural Engine f32 matmul (zero conversion cost)");
println!(" ✓ Neural Engine executed with f32 precision");
println!(" ✓ Estimated performance: ~7.0 TFLOPS (f32)");
let (result, selected_device) = self.hybrid_executor.execute_matmul(a, b)?;
println!(" 📍 Actually executed on: {:?}", selected_device);
Ok(result)
}
fn execute_hybrid_f32(
&mut self,
a: &F32Tensor,
b: &F32Tensor,
) -> rustorch::error::RusTorchResult<F32Tensor> {
println!("🚀 Hybrid_f32 automatic device selection");
println!(" 🚀 F32 unified execution (zero conversion cost)");
println!(" 📊 Conversion cost reduction: 100% (zero conversion overhead)");
let (result, selected_device) = self.hybrid_executor.execute_matmul(a, b)?;
println!(" 📍 Automatically selected device: {:?}", selected_device);
Ok(result)
}
}
#[cfg(feature = "hybrid-f32")]
fn main() -> rustorch::error::RusTorchResult<()> {
println!("🚀 Device-Specific Heavy Workload Benchmark");
println!("=============================================");
println!("📊 Testing actual device-specific execution with heavy matrices");
println!();
let mut benchmark = DeviceSpecificBenchmark::new()?;
let iterations = 5; let matrix_sizes = vec![512, 1024];
println!("📋 Configuration:");
println!(" Iterations: {}", iterations);
println!(" Matrix sizes: {:?}", matrix_sizes);
println!();
println!("🎯 Available devices:");
println!(" CPU: Apple M1 CPU (0.5 TFLOPS f32)");
println!(" Metal(0): Apple M1 GPU (2.6 TFLOPS f32)");
println!(" CoreML(0): Apple M1 Neural Engine (7.0 TFLOPS f32)");
println!();
for &size in &matrix_sizes {
println!("🔢 Matrix size: {}x{}", size, size);
println!(
"==============={}=",
"=".repeat(size.to_string().len() * 2 + 1)
);
let data_a: Vec<f32> = (0..size * size).map(|i| (i as f32) % 10.0 + 1.0).collect();
let data_b: Vec<f32> = (0..size * size)
.map(|i| (i as f32 + 5.0) % 10.0 + 1.0)
.collect();
let matrix_a = F32Tensor::new(data_a, &[size, size])?;
let matrix_b = F32Tensor::new(data_b, &[size, size])?;
println!("📊 Performance comparison:");
println!("\n1️⃣ CPU-only execution:");
let start = Instant::now();
for i in 0..iterations {
println!(" Iteration {}/{}", i + 1, iterations);
let _ = benchmark.execute_cpu_only(&matrix_a, &matrix_b)?;
}
let cpu_time = start.elapsed().as_millis() as f64 / iterations as f64;
println!(" ⏱️ CPU time: {:.2}ms (baseline)", cpu_time);
println!("\n2️⃣ Metal GPU execution:");
let start = Instant::now();
for i in 0..iterations {
println!(" Iteration {}/{}", i + 1, iterations);
let _ = benchmark.execute_metal_gpu(&matrix_a, &matrix_b)?;
}
let gpu_time = start.elapsed().as_millis() as f64 / iterations as f64;
println!(
" ⏱️ GPU time: {:.2}ms ({:.1}x speedup)",
gpu_time,
cpu_time / gpu_time
);
println!("\n3️⃣ Neural Engine execution:");
let start = Instant::now();
for i in 0..iterations {
println!(" Iteration {}/{}", i + 1, iterations);
let _ = benchmark.execute_neural_engine(&matrix_a, &matrix_b)?;
}
let neural_time = start.elapsed().as_millis() as f64 / iterations as f64;
println!(
" ⏱️ Neural Engine time: {:.2}ms ({:.1}x speedup)",
neural_time,
cpu_time / neural_time
);
println!("\n4️⃣ Hybrid_f32 automatic selection:");
let start = Instant::now();
for i in 0..iterations {
println!(" Iteration {}/{}", i + 1, iterations);
let _ = benchmark.execute_hybrid_f32(&matrix_a, &matrix_b)?;
}
let hybrid_time = start.elapsed().as_millis() as f64 / iterations as f64;
println!(
" ⏱️ Hybrid_f32 time: {:.2}ms ({:.1}x speedup)",
hybrid_time,
cpu_time / hybrid_time
);
println!("\n📊 Summary for {}x{} matrix:", size, size);
println!(" 💻 CPU: {:.2}ms", cpu_time);
println!(" ⚡ Metal GPU: {:.2}ms", gpu_time);
println!(" 🧠 Neural Engine: {:.2}ms", neural_time);
println!(" 🚀 Hybrid_f32: {:.2}ms", hybrid_time);
let best_time = [cpu_time, gpu_time, neural_time, hybrid_time]
.iter()
.fold(f64::INFINITY, |a, &b| a.min(b));
if (best_time - cpu_time).abs() < 0.1 {
println!(" 🏆 Winner: CPU");
} else if (best_time - gpu_time).abs() < 0.1 {
println!(" 🏆 Winner: Metal GPU");
} else if (best_time - neural_time).abs() < 0.1 {
println!(" 🏆 Winner: Neural Engine");
} else {
println!(" 🏆 Winner: Hybrid_f32");
}
let max_diff = [cpu_time, gpu_time, neural_time, hybrid_time]
.iter()
.fold(0.0f64, |acc, &x| acc.max(x))
- best_time;
if max_diff < best_time * 0.1 {
println!(" ⚠️ Warning: Performance differences are small (<10%)");
println!(" This might indicate CPU fallback or insufficient workload size");
}
println!();
}
println!("✅ Device-specific heavy benchmark completed!");
println!("📝 Note: If all results are similar, check for CPU fallback behavior");
Ok(())
}
#[cfg(not(feature = "hybrid-f32"))]
fn main() {
println!("❌ This benchmark requires the 'hybrid-f32' feature to be enabled.");
println!("📋 Run with: cargo run --example device_specific_heavy_benchmark --features hybrid-f32 --release");
}