forge-sort 0.2.3

GPU radix sort for Apple Silicon — 4,800+ Mkeys/s, 6 data types, zero-copy
//! Head-to-head: Rust std sort_unstable (ipnsort) vs forge-sort (GPU radix)
//! Same data, same process, fair comparison.

use std::time::Instant;
use std::hint::black_box;

fn bench<F: FnMut(&mut [u32])>(name: &str, data: &[u32], runs: usize, mut f: F) -> f64 {
    // Warmup
    for _ in 0..3 {
        let mut v = data.to_vec();
        f(&mut v);
        black_box(&v);
    }

    let mut times = Vec::with_capacity(runs);
    for _ in 0..runs {
        let mut v = data.to_vec();
        let t0 = Instant::now();
        f(&mut v);
        let elapsed = t0.elapsed();
        black_box(&v);

        // Verify
        for w in v.windows(2) {
            assert!(w[0] <= w[1], "INCORRECT at {}!", name);
        }
        times.push(elapsed.as_secs_f64());
    }

    times.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let median = times[times.len() / 2];
    let mkeys = data.len() as f64 / median / 1e6;
    println!("  {name:40}  {:8.2} ms  {:8.0} Mk/s", median * 1000.0, mkeys);
    median
}

fn main() {
    let mut sorter = forge_sort::GpuSorter::new().expect("GPU init failed");

    println!("================================================================");
    println!("  Rust std::sort_unstable (ipnsort)  vs  forge-sort (GPU radix)");
    println!("  Same random u32 data, same process, median of 7 runs");
    println!("================================================================\n");

    for &n in &[10_000, 50_000, 100_000, 500_000, 1_000_000, 4_000_000, 16_000_000] {
        // Generate identical random data
        let mut rng: u64 = 0xdeadbeef12345678;
        let data: Vec<u32> = (0..n).map(|_| {
            rng ^= rng << 13;
            rng ^= rng >> 7;
            rng ^= rng << 17;
            rng as u32
        }).collect();

        println!("--- {:>12} elements ---", format_num(n));

        let cpu_ms = bench("sort_unstable (CPU ipnsort)", &data, 7, |v| {
            v.sort_unstable();
        });

        let gpu_ms = bench("forge_sort::sort_u32 (GPU radix)", &data, 7, |v| {
            sorter.sort_u32(v).unwrap();
        });

        let speedup = cpu_ms / gpu_ms;
        if speedup >= 1.0 {
            println!("  >>> GPU wins: {:.1}x faster\n", speedup);
        } else {
            println!("  >>> CPU wins: {:.1}x faster\n", 1.0 / speedup);
        }
    }
}

fn format_num(n: usize) -> String {
    if n >= 1_000_000 {
        format!("{}M", n / 1_000_000)
    } else if n >= 1_000 {
        format!("{}K", n / 1_000)
    } else {
        format!("{}", n)
    }
}