use std::time::Instant;
use std::hint::black_box;
fn bench<F: FnMut(&mut [u32])>(name: &str, data: &[u32], runs: usize, mut f: F) -> f64 {
for _ in 0..3 {
let mut v = data.to_vec();
f(&mut v);
black_box(&v);
}
let mut times = Vec::with_capacity(runs);
for _ in 0..runs {
let mut v = data.to_vec();
let t0 = Instant::now();
f(&mut v);
let elapsed = t0.elapsed();
black_box(&v);
for w in v.windows(2) {
assert!(w[0] <= w[1], "INCORRECT at {}!", name);
}
times.push(elapsed.as_secs_f64());
}
times.sort_by(|a, b| a.partial_cmp(b).unwrap());
let median = times[times.len() / 2];
let mkeys = data.len() as f64 / median / 1e6;
println!(" {name:40} {:8.2} ms {:8.0} Mk/s", median * 1000.0, mkeys);
median
}
fn main() {
let mut sorter = forge_sort::GpuSorter::new().expect("GPU init failed");
println!("================================================================");
println!(" Rust std::sort_unstable (ipnsort) vs forge-sort (GPU radix)");
println!(" Same random u32 data, same process, median of 7 runs");
println!("================================================================\n");
for &n in &[10_000, 50_000, 100_000, 500_000, 1_000_000, 4_000_000, 16_000_000] {
let mut rng: u64 = 0xdeadbeef12345678;
let data: Vec<u32> = (0..n).map(|_| {
rng ^= rng << 13;
rng ^= rng >> 7;
rng ^= rng << 17;
rng as u32
}).collect();
println!("--- {:>12} elements ---", format_num(n));
let cpu_ms = bench("sort_unstable (CPU ipnsort)", &data, 7, |v| {
v.sort_unstable();
});
let gpu_ms = bench("forge_sort::sort_u32 (GPU radix)", &data, 7, |v| {
sorter.sort_u32(v).unwrap();
});
let speedup = cpu_ms / gpu_ms;
if speedup >= 1.0 {
println!(" >>> GPU wins: {:.1}x faster\n", speedup);
} else {
println!(" >>> CPU wins: {:.1}x faster\n", 1.0 / speedup);
}
}
}
fn format_num(n: usize) -> String {
if n >= 1_000_000 {
format!("{}M", n / 1_000_000)
} else if n >= 1_000 {
format!("{}K", n / 1_000)
} else {
format!("{}", n)
}
}