/// Fallback entry point for targets other than x86_64.
///
/// The demo module in this file is compiled only for x86_64, so there is
/// nothing to run here; say so instead of exiting silently.
#[cfg(not(target_arch = "x86_64"))]
fn main() {
    eprintln!("This example requires an x86_64 target; nothing to run.");
}
#[cfg(target_arch = "x86_64")]
mod x86_impl {
use archmage::{SimdToken, X64V3Token, arcane};
use magetypes::simd::polyfill::v3 as poly;
use std::time::Instant;
/// Number of `f32` elements in the input vector that `main` builds (64 * 1024).
const N: usize = 64 * 1024;
/// Number of times each summation routine is called inside a timed loop.
const ITERATIONS: u32 = 1000;
/// Sums `data` using the polyfilled vector type `poly::f32xN`.
///
/// The slice is consumed in groups of `poly::LANES_F32` elements; each group
/// is loaded into a vector and added to a vector accumulator. The accumulator
/// is then collapsed with `reduce_add`, and any elements that did not fill a
/// whole group are added one by one, in order.
///
/// `token` is the value `main` obtains from `X64V3Token::summon()`; it is
/// forwarded to every vector constructor.
#[arcane]
fn sum_polyfill(token: X64V3Token, data: &[f32]) -> f32 {
    let mut acc = poly::f32xN::zero(token);
    let chunks = data.chunks_exact(poly::LANES_F32);
    let remainder = chunks.remainder();
    for chunk in chunks {
        // The array length is tied to the same constant used for chunking, so
        // the two can never drift apart and the conversion cannot fail.
        let arr: &[f32; poly::LANES_F32] = chunk
            .try_into()
            .expect("chunks_exact yields slices of exactly LANES_F32 elements");
        acc += poly::f32xN::load(token, arr);
    }
    let mut sum = acc.reduce_add();
    for &x in remainder {
        sum += x;
    }
    sum
}
/// Sums `data` four elements at a time using `magetypes::simd::f32x4`.
///
/// Whole groups of four are loaded and added into an `f32x4` accumulator,
/// which is collapsed with `reduce_add`; the leftover tail (fewer than four
/// elements) is then added element by element, in order.
#[arcane]
fn sum_native_sse(token: archmage::X64V3Token, data: &[f32]) -> f32 {
    use magetypes::simd::f32x4;

    // Separate the part that divides evenly into groups of four from the tail.
    let (body, tail) = data.split_at(data.len() - data.len() % 4);

    let mut lanes = f32x4::zero(token);
    for group in body.chunks_exact(4) {
        let block: &[f32; 4] = group.try_into().unwrap();
        lanes += f32x4::load(token, block);
    }

    let mut total = lanes.reduce_add();
    for &leftover in tail {
        total += leftover;
    }
    total
}
/// Sums `data` eight elements at a time using `magetypes::simd::f32x8`.
///
/// Whole groups of eight are loaded and added into an `f32x8` accumulator,
/// which is collapsed with `reduce_add`; the elements left over after the
/// last whole group are then added one at a time, in order.
#[arcane]
fn sum_native_avx2(token: archmage::X64V3Token, data: &[f32]) -> f32 {
    use magetypes::simd::f32x8;

    let mut groups = data.chunks_exact(8);
    let mut lanes = f32x8::zero(token);
    // Borrow the iterator so the remainder can still be queried afterwards.
    for group in groups.by_ref() {
        let block: &[f32; 8] = group.try_into().unwrap();
        lanes += f32x8::load(token, block);
    }

    let mut total = lanes.reduce_add();
    for &leftover in groups.remainder() {
        total += leftover;
    }
    total
}
/// Scalar reference implementation: adds the elements of `data` front to back.
///
/// `main` uses the result both as the expected value for the SIMD routines
/// and as the timing baseline.
fn sum_scalar(data: &[f32]) -> f32 {
    let total = data.iter().copied().sum::<f32>();
    total
}
pub fn main() {
println!("\n╔══════════════════════════════════════════════════════════════╗");
println!("║ Polyfill SIMD Demonstration ║");
println!("╚══════════════════════════════════════════════════════════════╝\n");
let data: Vec<f32> = (0..N).map(|i| (i as f32) * 0.001).collect();
let expected = sum_scalar(&data);
println!("Expected sum: {:.2}\n", expected);
if let Some(token) = archmage::X64V3Token::summon() {
let polyfill_result = sum_polyfill(token, &data);
let native_sse_result = sum_native_sse(token, &data);
println!("Polyfilled f32x8 on SSE: {:.2}", polyfill_result);
println!("Native SSE f32x4: {:.2}", native_sse_result);
assert!(
(polyfill_result - expected).abs() / expected.abs() < 0.001,
"Polyfill result mismatch"
);
}
if let Some(token) = archmage::X64V3Token::summon() {
let native_avx2_result = sum_native_avx2(token, &data);
println!("Native AVX2 f32x8: {:.2}", native_avx2_result);
}
println!(
"\n=== Benchmarks ({} elements, {} iterations) ===\n",
N, ITERATIONS
);
let start = Instant::now();
let mut result;
for _ in 0..ITERATIONS {
result = sum_scalar(&data);
std::hint::black_box(result);
}
let scalar_time = start.elapsed();
println!(
" Scalar: {:>8.2} ms",
scalar_time.as_secs_f64() * 1000.0
);
if let Some(token) = archmage::X64V3Token::summon() {
let start = Instant::now();
for _ in 0..ITERATIONS {
result = sum_polyfill(token, &data);
std::hint::black_box(result);
}
let polyfill_time = start.elapsed();
println!(
" Polyfill f32x8: {:>8.2} ms ({:.1}x faster than scalar)",
polyfill_time.as_secs_f64() * 1000.0,
scalar_time.as_secs_f64() / polyfill_time.as_secs_f64()
);
let start = Instant::now();
for _ in 0..ITERATIONS {
result = sum_native_sse(token, &data);
std::hint::black_box(result);
}
let native_sse_time = start.elapsed();
println!(
" Native SSE f32x4: {:>8.2} ms ({:.1}x faster than scalar)",
native_sse_time.as_secs_f64() * 1000.0,
scalar_time.as_secs_f64() / native_sse_time.as_secs_f64()
);
}
if let Some(token) = archmage::X64V3Token::summon() {
let start = Instant::now();
for _ in 0..ITERATIONS {
result = sum_native_avx2(token, &data);
std::hint::black_box(result);
}
let native_avx2_time = start.elapsed();
println!(
" Native AVX2 f32x8:{:>8.2} ms ({:.1}x faster than scalar)",
native_avx2_time.as_secs_f64() * 1000.0,
scalar_time.as_secs_f64() / native_avx2_time.as_secs_f64()
);
}
println!("\nDone!");
}
}
/// Entry point on x86_64: hands control to the demo in `x86_impl`.
#[cfg(target_arch = "x86_64")]
fn main() {
    x86_impl::main();
}