use scirs2_core::ndarray_ext::Array1;
use std::time::Instant;
#[cfg(feature = "simd")]
use scirs2_core::simd::{
simd_mul_f32, simd_mul_f32_branchfree, simd_mul_f32_cacheline, simd_mul_f32_hyperoptimized,
simd_mul_f32_pipelined, simd_mul_f32_tlb_optimized,
};
/// Fallback entry point used when the `simd` feature is disabled.
///
/// Prints a short notice explaining that the feature is required and how to
/// enable it, then returns without running any benchmark.
#[cfg(not(feature = "simd"))]
fn main() {
    const NOTICE: [&str; 2] = [
        "This example requires the 'simd' feature to be enabled.",
        "Run with: cargo run --example hyper_performance_benchmark --features simd",
    ];
    for line in NOTICE.iter() {
        println!("{}", line);
    }
}
#[cfg(feature = "simd")]
/// Times `op` and prints a one-line report for it.
///
/// `op` is first called 10 times untimed as a warm-up, then `iterations`
/// times under a single [`Instant`] measurement.
///
/// # Parameters
/// - `name`: label printed in the first column of the report line.
/// - `size`: number of elements processed per call; only used to compute the
///   reported throughput (counted as 4 bytes per element per iteration).
/// - `iterations`: number of timed calls to `op`.
/// - `op`: the operation to measure.
///
/// # Returns
/// The average wall-clock time per timed call, in milliseconds. When
/// `iterations` is 0 this is a division by zero and the result is not finite.
fn benchmark_operation<F>(name: &str, size: usize, iterations: usize, mut op: F) -> f64
where
    F: FnMut(),
{
    const WARMUP_RUNS: usize = 10;
    const BYTES_PER_ELEMENT: f64 = 4.0;

    for _ in 0..WARMUP_RUNS {
        op();
    }

    let start = Instant::now();
    for _ in 0..iterations {
        op();
    }
    let elapsed_secs = start.elapsed().as_secs_f64();
    // Use fractional milliseconds: `Duration::as_millis` yields an integer, for
    // which the `.2` precision in the format string is ignored, so short runs
    // would be reported as `0 ms`.
    let elapsed_ms = elapsed_secs * 1000.0;

    let throughput = (size as f64 * iterations as f64 * BYTES_PER_ELEMENT) / elapsed_secs / 1e9;
    println!(
        "{:<30} | Time: {:>8.2} ms | Throughput: {:>6.2} GB/s",
        name, elapsed_ms, throughput
    );

    elapsed_ms / iterations as f64
}
/// Runs every multiplication benchmark for one array size and prints a report.
///
/// Two `f32` arrays of `size` elements (filled with 1.5 and 2.5) are multiplied
/// element-wise by a scalar loop and by six SIMD variants. Each variant is
/// timed with `benchmark_operation` using `iterations` timed calls. Afterwards
/// the speedup of each SIMD variant relative to the scalar loop is printed,
/// followed by the fastest SIMD variant.
///
/// # Parameters
/// - `size`: number of `f32` elements in each input array.
/// - `iterations`: number of timed calls per variant.
#[cfg(feature = "simd")]
fn run_benchmark_suite(size: usize, iterations: usize) {
    println!("\n{}", "=".repeat(75));
    println!(
        "Array size: {} elements ({:.2} MB)",
        size,
        size as f64 * 4.0 / 1_048_576.0
    );
    println!("Iterations: {}", iterations);
    println!("{}", "-".repeat(75));

    let a = Array1::from_vec(vec![1.5f32; size]);
    let b = Array1::from_vec(vec![2.5f32; size]);

    // NOTE(review): the baseline is intentionally left as an indexed loop that
    // allocates its output on every call; rewriting it would change what the
    // "Scalar baseline" row measures.
    let scalar_time = benchmark_operation("Scalar baseline", size, iterations, || {
        let mut result = vec![0.0f32; size];
        for i in 0..size {
            result[i] = a[i] * b[i];
        }
        std::hint::black_box(&result);
    });
    let standard_time = benchmark_operation("Standard SIMD", size, iterations, || {
        let result = simd_mul_f32(&a.view(), &b.view());
        std::hint::black_box(&result);
    });
    let branchfree_time = benchmark_operation("Branch-free SIMD", size, iterations, || {
        let result = simd_mul_f32_branchfree(&a.view(), &b.view());
        std::hint::black_box(&result);
    });
    let cacheline_time = benchmark_operation("Cache-line aware", size, iterations, || {
        let result = simd_mul_f32_cacheline(&a.view(), &b.view());
        std::hint::black_box(&result);
    });
    let pipelined_time = benchmark_operation("Software pipelined", size, iterations, || {
        let result = simd_mul_f32_pipelined(&a.view(), &b.view());
        std::hint::black_box(&result);
    });
    let tlb_time = benchmark_operation("TLB-optimized", size, iterations, || {
        let result = simd_mul_f32_tlb_optimized(&a.view(), &b.view());
        std::hint::black_box(&result);
    });
    let hyper_time = benchmark_operation("HYPER-OPTIMIZED", size, iterations, || {
        let result = simd_mul_f32_hyperoptimized(&a.view(), &b.view());
        std::hint::black_box(&result);
    });

    println!("{}", "-".repeat(75));
    println!("Performance vs Scalar:");
    println!(" Standard SIMD: {:.2}x", scalar_time / standard_time);
    println!(
        " Branch-free SIMD: {:.2}x",
        scalar_time / branchfree_time
    );
    println!(" Cache-line aware: {:.2}x", scalar_time / cacheline_time);
    println!(" Software pipelined: {:.2}x", scalar_time / pipelined_time);
    println!(" TLB-optimized: {:.2}x", scalar_time / tlb_time);
    println!(" HYPER-OPTIMIZED: {:.2}x", scalar_time / hyper_time);

    let mut times = [
        ("Standard SIMD", standard_time),
        ("Branch-free SIMD", branchfree_time),
        ("Cache-line aware", cacheline_time),
        ("Software pipelined", pipelined_time),
        ("TLB-optimized", tlb_time),
        ("HYPER-OPTIMIZED", hyper_time),
    ];
    // Rank ascending by per-iteration time. `total_cmp` defines a total order
    // on f64, so a non-finite timing can no longer abort the report with a
    // panic the way `partial_cmp(..).expect(..)` did.
    times.sort_by(|lhs, rhs| lhs.1.total_cmp(&rhs.1));

    println!("{}", "-".repeat(75));
    println!(
        "🏆 WINNER: {} ({:.2}x faster than scalar)",
        times[0].0,
        scalar_time / times[0].1
    );
}
/// Entry point when the `simd` feature is enabled.
///
/// Prints a banner, runs `run_benchmark_suite` for a fixed list of array
/// sizes, and finally performs a small correctness check: every SIMD variant
/// multiplies the same two five-element arrays and the results are compared
/// against the standard SIMD implementation.
#[cfg(feature = "simd")]
fn main() {
    println!("\n╔{}╗", "═".repeat(73));
    println!("║{:^73}║", "HYPER-PERFORMANCE SIMD BENCHMARK");
    println!("║{:^73}║", "Testing Ultra-Optimized Implementations");
    println!("╚{}╝", "═".repeat(73));

    // (element count, timed iterations) pairs; the iteration count shrinks as
    // the arrays grow.
    let test_cases = [
        (256, 10000),
        (4096, 5000),
        (65536, 1000),
        (524288, 100),
        (4194304, 10),
        (16777216, 5),
    ];
    for (size, iterations) in test_cases.iter().copied() {
        run_benchmark_suite(size, iterations);
    }

    println!("\n╔{}╗", "═".repeat(73));
    println!("║{:^73}║", "BENCHMARK COMPLETE");
    println!("╚{}╝", "═".repeat(73));

    println!("\n📊 Correctness Verification:");
    let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
    let b = Array1::from_vec(vec![2.0, 3.0, 4.0, 5.0, 6.0]);

    let standard = simd_mul_f32(&a.view(), &b.view());
    // The branch-free variant is benchmarked as well, so it is checked here
    // too; otherwise the "All implementations" message would overstate what
    // was actually verified.
    let branchfree = simd_mul_f32_branchfree(&a.view(), &b.view());
    let cacheline = simd_mul_f32_cacheline(&a.view(), &b.view());
    let pipelined = simd_mul_f32_pipelined(&a.view(), &b.view());
    let tlb = simd_mul_f32_tlb_optimized(&a.view(), &b.view());
    let hyper = simd_mul_f32_hyperoptimized(&a.view(), &b.view());

    let all_correct = standard == branchfree
        && standard == cacheline
        && standard == pipelined
        && standard == tlb
        && standard == hyper;
    if all_correct {
        println!("✅ All implementations produce identical results!");
    } else {
        println!("❌ Implementation mismatch detected!");
    }
}