rustygraph 0.4.2

//! Realistic GPU vs CPU Benchmark
//!
//! This benchmark demonstrates the current state of GPU acceleration:
//! - GPU uses naive O(n²) algorithm (easy to parallelize)
//! - CPU uses optimized O(n) algorithm (harder to parallelize)
//!
//! This shows why GPU is currently slower and finds different edges.
//!
//! Run with: cargo run --example gpu_realistic_benchmark --features metal --release

use rustygraph::TimeSeries;
use std::time::Instant;

#[cfg(all(target_os = "macos", target_arch = "aarch64", feature = "metal"))]
use rustygraph::performance::{GpuVisibilityGraph, GpuConfig, GpuCapabilities};

fn generate_test_data(size: usize) -> Vec<f64> {
    (0..size)
        .map(|i| {
            let x = i as f64 * 0.1;
            x.sin() * 100.0 + 100.0
        })
        .collect()
}

fn main() {
    println!("🔬 Realistic GPU vs CPU Benchmark\n");
    println!("{}", "=".repeat(80));

    #[cfg(all(target_os = "macos", target_arch = "aarch64", feature = "metal"))]
    {
        let caps = GpuCapabilities::detect();
        println!("\n📊 Hardware:");
        caps.print_info();

        if !caps.has_metal() {
            println!("\n⚠️  Metal GPU not available.");
            return;
        }

        println!("\n{}", "=".repeat(80));
        println!("\n⚠️  IMPORTANT NOTE:");
        println!("{}", "-".repeat(80));
        println!("GPU Implementation: Naive O(n²) algorithm (easy to parallelize)");
        println!("CPU Implementation: Optimized O(n) algorithm (monotonic stack)");
        println!("\nThis is why GPU is slower - it's doing more work per edge!");
        println!("This is a common trade-off in GPU programming.");

        println!("\n{}", "=".repeat(80));
        println!("\n📈 Performance Comparison");
        println!("{}", "=".repeat(80));
        println!("{:<8} {:>12} {:>12} {:>10} {:>10} {:>10}",
                 "Size", "GPU Time", "CPU Time", "GPU Edges", "CPU Edges", "Speedup");
        println!("{}", "-".repeat(80));

        // Test reasonable sizes
        let test_sizes = vec![100, 500, 1000, 2000, 5000];

        for &size in &test_sizes {
            let data = generate_test_data(size);
            let series = TimeSeries::from_raw(data.clone()).unwrap();

            // GPU test
            let config = GpuConfig::for_apple_silicon().with_min_nodes(0);
            let gpu = GpuVisibilityGraph::with_config(config);

            let start = Instant::now();
            let graph_gpu = gpu.build_natural(&series).unwrap();
            let gpu_time = start.elapsed();
            let gpu_edges = graph_gpu.edges().len();

            // CPU test
            let start = Instant::now();
            let graph_cpu = rustygraph::VisibilityGraph::from_series(&series)
                .natural_visibility()
                .unwrap();
            let cpu_time = start.elapsed();
            let cpu_edges = graph_cpu.edges().len();

            let speedup = cpu_time.as_secs_f64() / gpu_time.as_secs_f64();

            println!("{:<8} {:>12.2?} {:>12.2?} {:>10} {:>10} {:>9.2}x",
                     size, gpu_time, cpu_time, gpu_edges, cpu_edges, speedup);
        }

        println!("\n{}", "=".repeat(80));
        println!("\n🔍 Analysis:");
        println!("{}", "-".repeat(80));

        println!("\n❌ Why GPU is Slower:");
        println!("  1. Naive O(n²) algorithm vs optimized O(n) on CPU");
        println!("  2. GPU overhead: 50-70ms per graph");
        println!("  3. CPU already has excellent SIMD + parallel optimizations");
        println!("  4. Memory bandwidth limited, not compute limited");

        println!("\n❌ Why Edge Counts Differ:");
        println!("  1. Different algorithms (naive vs optimized)");
        println!("  2. Float precision handling (EPSILON in GPU)");
        println!("  3. Both are trying to compute same thing, but implementations differ");

        println!("\n✅ What Would Make GPU Faster:");
        println!("  1. Implement optimized O(n) algorithm on GPU (much harder)");
        println!("  2. Process multiple graphs in batch (amortize overhead)");
        println!("  3. Use GPU for truly massive graphs (> 100k nodes)");
        println!("  4. Optimize for GPU memory patterns");

        println!("\n💡 Current Recommendation:");
        println!("  → Use CPU for all practical purposes");
        println!("  → GPU implementation is educational but not production-ready");
        println!("  → CPU SIMD + Parallel gives 10-30x speedup already");

        println!("\n{}", "=".repeat(80));
    }

    #[cfg(not(all(target_os = "macos", target_arch = "aarch64", feature = "metal")))]
    {
        println!("⚠️  This benchmark requires Apple Silicon and the 'metal' feature.");
    }
}