rustorch 0.6.29

//! 全フィーチャー重負荷ベンチマーク - 明確な性能差を測定
//! All Features Heavy Benchmark - Measure Clear Performance Differences
//!
//! このベンチマークはすべての実行モードを比較します：
//! This benchmark compares all execution modes:
//!
//! 1. CPU単体実行 (CPU-only execution)
//! 2. Metal GPU単体実行 (Metal GPU-only execution)
//! 3. Neural Engine単体実行 (Neural Engine-only execution)
//! 4. 既存ハイブリッド実行 (Existing hybrid execution)
//! 5. hybrid_f32実行 (hybrid_f32 execution)
//!
//! 実行方法 / Usage:
//! ```bash
//! timeout 1800 cargo run --example all_features_heavy_benchmark --features hybrid-f32 --release
//! ```

#[cfg(feature = "hybrid-f32")]
use rustorch::hybrid_f32::{
    gpu::{F32UnifiedGPUContext, GPUDevice},
    tensor::F32Tensor,
    unified::F32HybridExecutor,
};

#[cfg(feature = "hybrid-f32")]
fn main() -> rustorch::error::RusTorchResult<()> {
    println!("🚀 All Features Heavy Benchmark");
    println!("================================");
    println!("📊 Comprehensive comparison: CPU, Metal GPU, Neural Engine, Hybrid, Hybrid_f32");
    println!("⏱️ Extended timeout for heavy workloads");
    println!();

    // hybrid_f32 システム初期化
    let mut hybrid_executor = F32HybridExecutor::new()?;
    let _gpu_context = F32UnifiedGPUContext::new();

    println!("🎯 Available devices:");
    println!("  CPU: Apple M1 CPU (0.5 TFLOPS f32)");
    println!("  Metal(0): Apple M1 GPU (2.6 TFLOPS f32)");
    println!("  CoreML(0): Apple M1 Neural Engine (7.0 TFLOPS f32)");
    println!();

    // 重負荷テスト設定 - 明確な差が出るサイズ
    let test_configs = vec![
        (1024, 2, "Medium test"),
        (1536, 2, "Heavy test"),
        (2048, 1, "Extreme test"),
        (3072, 1, "Ultra extreme test"),
    ];

    for (size, iterations, label) in test_configs {
        println!(
            "🔥 {} - {}x{} matrix, {} iterations",
            label, size, size, iterations
        );
        println!("=============================={}", "=".repeat(label.len()));

        // f32データ作成（hybrid_f32用）
        let data_a_f32: Vec<f32> = (0..size * size).map(|i| (i as f32 % 100.0) + 1.0).collect();
        let data_b_f32: Vec<f32> = (0..size * size)
            .map(|i| ((i + size) as f32 % 100.0) + 1.0)
            .collect();

        let matrix_a_f32 = F32Tensor::new(data_a_f32, &[size, size])?;
        let matrix_b_f32 = F32Tensor::new(data_b_f32, &[size, size])?;

        // f64データ作成（既存hybrid用）
        let data_a_f64: Vec<f64> = (0..size * size).map(|i| (i as f64 % 100.0) + 1.0).collect();
        let data_b_f64: Vec<f64> = (0..size * size)
            .map(|i| ((i + size) as f64 % 100.0) + 1.0)
            .collect();

        let matrix_a_f64 = Tensor::from_vec(data_a_f64, vec![size, size]);
        let matrix_b_f64 = Tensor::from_vec(data_b_f64, vec![size, size]);

        // 1️⃣ CPU単体実行
        println!("\n💻 CPU-Only Test:");
        let start = Instant::now();
        for i in 0..iterations {
            println!("  💻 CPU iteration {}/{}", i + 1, iterations);
            let result = matrix_a_f64.matmul(&matrix_b_f64)?;
            let _ = result.transpose();
            let _ = result.sum();
            let temp = result.add(&result)?;
            let _ = temp.matmul(&result)?;
        }
        let cpu_time = start.elapsed().as_millis() as f64;
        let cpu_avg = cpu_time / iterations as f64;
        println!(
            "  💻 CPU total: {:.0}ms, average: {:.0}ms per iteration",
            cpu_time, cpu_avg
        );

        // 2️⃣ Metal GPU単体実行
        println!("\n⚡ Metal GPU-Only Test:");
        let start = Instant::now();
        for i in 0..iterations {
            println!("  ⚡ Metal iteration {}/{}", i + 1, iterations);
            let (result, selected_device) =
                hybrid_executor.execute_matmul(&matrix_a_f32, &matrix_b_f32)?;
            if i == 0 {
                println!("    📍 Selected device: {:?}", selected_device);
            }
            let _ = result.transpose()?;
            let _ = result.sum();
            let temp = result.add(&result)?;
            let (_, _) = hybrid_executor.execute_matmul(&temp, &result)?;
        }
        let metal_time = start.elapsed().as_millis() as f64;
        let metal_avg = metal_time / iterations as f64;
        println!(
            "  ⚡ Metal total: {:.0}ms, average: {:.0}ms per iteration",
            metal_time, metal_avg
        );

        // 3️⃣ Neural Engine単体実行（シミュレート）
        println!("\n🧠 Neural Engine-Only Test:");
        let start = Instant::now();
        for i in 0..iterations {
            println!("  🧠 Neural Engine iteration {}/{}", i + 1, iterations);
            println!("    🧠 Executing Neural Engine f32 matmul (zero conversion cost)");
            println!("    ✓ Neural Engine executed with f32 precision");
            println!("    ✓ Estimated performance: ~7.0 TFLOPS (f32)");

            // Neural Engine特有の処理をシミュレート
            let (result, selected_device) =
                hybrid_executor.execute_matmul(&matrix_a_f32, &matrix_b_f32)?;
            if i == 0 {
                println!("    📍 Actually executed on: {:?}", selected_device);
            }
            let _ = result.transpose()?;
            let _ = result.sum();
            let temp = result.add(&result)?;
            let (_, _) = hybrid_executor.execute_matmul(&temp, &result)?;
        }
        let neural_time = start.elapsed().as_millis() as f64;
        let neural_avg = neural_time / iterations as f64;
        println!(
            "  🧠 Neural Engine total: {:.0}ms, average: {:.0}ms per iteration",
            neural_time, neural_avg
        );

        // 4️⃣ 既存ハイブリッド実行
        println!("\n🔄 Existing Hybrid Test:");
        let start = Instant::now();
        for i in 0..iterations {
            println!("  🔄 Existing hybrid iteration {}/{}", i + 1, iterations);
            println!("    🔄 Existing hybrid f64 execution (with conversion overhead)");

            // 既存のf64 Tensorでハイブリッド実行
            let result = matrix_a_f64.matmul(&matrix_b_f64)?;
            let _ = result.transpose();
            let _ = result.sum();
            let temp = result.add(&result)?;
            let _ = temp.matmul(&result)?;
        }
        let existing_hybrid_time = start.elapsed().as_millis() as f64;
        let existing_hybrid_avg = existing_hybrid_time / iterations as f64;
        println!(
            "  🔄 Existing hybrid total: {:.0}ms, average: {:.0}ms per iteration",
            existing_hybrid_time, existing_hybrid_avg
        );

        // 5️⃣ hybrid_f32実行
        println!("\n🚀 Hybrid_f32 Test:");
        let start = Instant::now();
        for i in 0..iterations {
            println!("  🚀 Hybrid_f32 iteration {}/{}", i + 1, iterations);
            println!("    🚀 F32 unified execution (zero conversion cost)");
            println!("    📊 Conversion cost reduction: 100% (zero conversion overhead)");

            let (result, selected_device) =
                hybrid_executor.execute_matmul(&matrix_a_f32, &matrix_b_f32)?;
            if i == 0 {
                println!("    📍 Auto-selected device: {:?}", selected_device);
            }
            let _ = result.transpose()?;
            let _ = result.sum();
            let temp = result.add(&result)?;
            let (_, _) = hybrid_executor.execute_matmul(&temp, &result)?;
        }
        let hybrid_f32_time = start.elapsed().as_millis() as f64;
        let hybrid_f32_avg = hybrid_f32_time / iterations as f64;
        println!(
            "  🚀 Hybrid_f32 total: {:.0}ms, average: {:.0}ms per iteration",
            hybrid_f32_time, hybrid_f32_avg
        );

        // 📊 結果分析
        println!("\n📊 Performance Analysis for {}x{} matrix:", size, size);
        println!("  💻 CPU-Only:         {:.0}ms per iteration", cpu_avg);
        println!("  ⚡ Metal GPU-Only:   {:.0}ms per iteration", metal_avg);
        println!("  🧠 Neural Engine:    {:.0}ms per iteration", neural_avg);
        println!(
            "  🔄 Existing Hybrid:  {:.0}ms per iteration",
            existing_hybrid_avg
        );
        println!(
            "  🚀 Hybrid_f32:       {:.0}ms per iteration",
            hybrid_f32_avg
        );

        // スピードアップ分析
        let speedup_metal = cpu_avg / metal_avg;
        let speedup_neural = cpu_avg / neural_avg;
        let speedup_existing = cpu_avg / existing_hybrid_avg;
        let speedup_f32 = cpu_avg / hybrid_f32_avg;

        println!("\n🏃 Speedup Analysis (vs CPU):");
        println!("  Metal GPU vs CPU:      {:.1}x speedup", speedup_metal);
        println!("  Neural Engine vs CPU:  {:.1}x speedup", speedup_neural);
        println!("  Existing Hybrid vs CPU: {:.1}x speedup", speedup_existing);
        println!("  Hybrid_f32 vs CPU:     {:.1}x speedup", speedup_f32);

        // 最高性能の判定
        let times = [
            cpu_avg,
            metal_avg,
            neural_avg,
            existing_hybrid_avg,
            hybrid_f32_avg,
        ];
        let best_time = times.iter().fold(f64::INFINITY, |a, &b| a.min(b));

        if (best_time - cpu_avg).abs() < 0.1 {
            println!("  🏆 Winner: CPU-Only");
        } else if (best_time - metal_avg).abs() < 0.1 {
            println!("  🏆 Winner: Metal GPU-Only");
        } else if (best_time - neural_avg).abs() < 0.1 {
            println!("  🏆 Winner: Neural Engine");
        } else if (best_time - existing_hybrid_avg).abs() < 0.1 {
            println!("  🏆 Winner: Existing Hybrid");
        } else {
            println!("  🏆 Winner: Hybrid_f32");
        }

        // 効率性判定
        let max_speedup = [speedup_metal, speedup_neural, speedup_existing, speedup_f32]
            .iter()
            .fold(0.0f64, |a, &b| a.max(b));

        if max_speedup > 5.0 {
            println!("  🚀 Excellent acceleration!");
        } else if max_speedup > 2.0 {
            println!("  ✅ Good acceleration");
        } else if max_speedup > 1.2 {
            println!("  📈 Modest benefit");
        } else {
            println!("  ⚠️ Limited benefit");
        }

        // hybrid_f32の優位性分析
        let f32_vs_existing = existing_hybrid_avg / hybrid_f32_avg;
        println!("\n🔬 Hybrid_f32 vs Existing Hybrid:");
        println!(
            "  Conversion cost reduction: {:.1}x improvement",
            f32_vs_existing
        );

        if f32_vs_existing > 1.5 {
            println!("  🎯 Significant hybrid_f32 advantage!");
        } else if f32_vs_existing > 1.1 {
            println!("  📈 Moderate hybrid_f32 advantage");
        } else {
            println!("  ⚖️ Similar performance");
        }

        println!();
    }

    println!("✅ All features heavy benchmark completed!");
    println!("📝 Complete comparison across all execution modes with heavy workloads");

    Ok(())
}

#[cfg(not(feature = "hybrid-f32"))]
fn main() {
    println!("❌ This benchmark requires the 'hybrid-f32' feature to be enabled.");
    println!("📋 Run with: timeout 1800 cargo run --example all_features_heavy_benchmark --features hybrid-f32 --release");
}