use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use feral::dense::schur_kernel::{axpy2_minus, axpy_minus};
#[cfg(target_arch = "aarch64")]
use feral::dense::schur_kernel::{
axpy2_minus_direct, axpy2_minus_unroll4, axpy2_minus_unroll4_nofma, axpy_minus_direct,
axpy_minus_unroll4, axpy_minus_unroll4_nofma,
};
/// Small xorshift64 pseudo-random generator used to build reproducible
/// benchmark inputs. The single field is the 64-bit generator state.
struct Xorshift64(u64);

impl Xorshift64 {
    /// Builds a generator from `seed`.
    ///
    /// A zero seed is replaced by a fixed non-zero constant: with an all-zero
    /// state every shift/xor step below would yield zero again.
    fn new(seed: u64) -> Self {
        const FALLBACK_SEED: u64 = 0x9E37_79B9_7F4A_7C15;
        match seed {
            0 => Self(FALLBACK_SEED),
            nonzero => Self(nonzero),
        }
    }

    /// Advances the state with the 13 / 7 / 17 shift-xor sequence and returns
    /// the new state.
    fn next_u64(&mut self) -> u64 {
        let step1 = self.0 ^ (self.0 << 13);
        let step2 = step1 ^ (step1 >> 7);
        let step3 = step2 ^ (step2 << 17);
        self.0 = step3;
        step3
    }

    /// Returns the next sample as an `f64` in `[-1.0, 1.0)`.
    fn next_f64(&mut self) -> f64 {
        // Bit pattern of 1.0: biased exponent set, mantissa all zero.
        const ONE_BITS: u64 = 0x3FF0_0000_0000_0000;
        // The top 52 random bits become the mantissa, giving a value in [1, 2).
        let mantissa = self.next_u64() >> 12;
        let unit = f64::from_bits(mantissa | ONE_BITS) - 1.0;
        // Stretch [0, 1) to [-1, 1).
        unit * 2.0 - 1.0
    }
}
/// Scalar reference kernel: performs `dst[i] -= alpha * src[i]` for every
/// index of `dst`.
///
/// `src` is indexed directly, so the call panics if `src` is shorter than
/// `dst`. `#[inline(never)]` keeps the kernel as a standalone function at the
/// benchmark call site.
#[inline(never)]
fn scalar_axpy_minus(dst: &mut [f64], src: &[f64], alpha: f64) {
    for (idx, out) in dst.iter_mut().enumerate() {
        *out -= alpha * src[idx];
    }
}
/// Scalar reference kernel for the two-source update: performs
/// `dst[i] -= alpha0 * src0[i] + alpha1 * src1[i]` for every index of `dst`.
///
/// Both products are summed first and the sum is then subtracted from the
/// destination element. `src0` and `src1` are indexed directly, so the call
/// panics if either is shorter than `dst`.
#[inline(never)]
fn scalar_axpy2_minus(dst: &mut [f64], src0: &[f64], alpha0: f64, src1: &[f64], alpha1: f64) {
    for (idx, out) in dst.iter_mut().enumerate() {
        *out -= alpha0 * src0[idx] + alpha1 * src1[idx];
    }
}
/// Vector lengths (number of `f64` elements) swept by the `axpy_minus` and
/// `axpy2_minus` benchmark groups: powers of two from 8 to 2048.
const LENGTHS: &[usize] = &[8, 16, 32, 64, 128, 256, 512, 1024, 2048];
/// Draws `len` consecutive samples from `rng` and returns them, in draw
/// order, as a freshly allocated vector.
fn make_vec(rng: &mut Xorshift64, len: usize) -> Vec<f64> {
    let mut values = Vec::with_capacity(len);
    for _ in 0..len {
        values.push(rng.next_f64());
    }
    values
}
/// Benchmarks the single-source update `dst -= alpha * src` for every length
/// in `LENGTHS`, under the Criterion group name `"axpy_minus"`.
///
/// Variants registered per length (benchmark id = variant name / length):
/// * `"scalar"` - the local `scalar_axpy_minus` reference loop;
/// * `"pulp"` - `feral::dense::schur_kernel::axpy_minus`;
/// * on `aarch64` only: `"direct_neon"`, `"unroll4_neon"` and
///   `"unroll4_nofma_neon"`, calling `axpy_minus_direct`,
///   `axpy_minus_unroll4` and `axpy_minus_unroll4_nofma` respectively.
///
/// Inputs come from a fixed-seed generator, so every run sees the same data.
/// All variants of one length read the same `src` buffer and `alpha`; each
/// variant gets its own copy of the initial destination. The destination is
/// updated in place and is not reset between timed iterations.
fn bench_axpy_minus(c: &mut Criterion) {
    let mut group = c.benchmark_group("axpy_minus");
    let mut rng = Xorshift64::new(0xA1B2_C3D4_E5F6_0789);

    for &len in LENGTHS {
        // Draw order (src, dst, alpha) determines the generated data; keep it.
        let src = make_vec(&mut rng, len);
        let dst_init = make_vec(&mut rng, len);
        let alpha = rng.next_f64() * 1.5;

        // Report throughput in elements per second for this length.
        group.throughput(Throughput::Elements(len as u64));

        // The closures borrow `src` and `alpha` directly; only the mutable
        // destination needs a private copy, made once outside the timed loop.
        group.bench_function(BenchmarkId::new("scalar", len), |b| {
            let mut dst = dst_init.clone();
            b.iter(|| {
                scalar_axpy_minus(black_box(&mut dst), black_box(&src), black_box(alpha));
            });
        });

        group.bench_function(BenchmarkId::new("pulp", len), |b| {
            let mut dst = dst_init.clone();
            b.iter(|| {
                axpy_minus(black_box(&mut dst), black_box(&src), black_box(alpha));
            });
        });

        #[cfg(target_arch = "aarch64")]
        {
            group.bench_function(BenchmarkId::new("direct_neon", len), |b| {
                let mut dst = dst_init.clone();
                b.iter(|| {
                    axpy_minus_direct(black_box(&mut dst), black_box(&src), black_box(alpha));
                });
            });

            group.bench_function(BenchmarkId::new("unroll4_neon", len), |b| {
                let mut dst = dst_init.clone();
                b.iter(|| {
                    axpy_minus_unroll4(black_box(&mut dst), black_box(&src), black_box(alpha));
                });
            });

            group.bench_function(BenchmarkId::new("unroll4_nofma_neon", len), |b| {
                let mut dst = dst_init.clone();
                b.iter(|| {
                    axpy_minus_unroll4_nofma(
                        black_box(&mut dst),
                        black_box(&src),
                        black_box(alpha),
                    );
                });
            });
        }
    }

    group.finish();
}
/// Benchmarks the two-source update `dst -= alpha0 * src0 + alpha1 * src1`
/// for every length in `LENGTHS`, under the Criterion group name
/// `"axpy2_minus"`.
///
/// Variants registered per length (benchmark id = variant name / length):
/// * `"scalar"` - the local `scalar_axpy2_minus` reference loop;
/// * `"pulp"` - `feral::dense::schur_kernel::axpy2_minus`;
/// * on `aarch64` only: `"direct_neon"`, `"unroll4_neon"` and
///   `"unroll4_nofma_neon"`, calling `axpy2_minus_direct`,
///   `axpy2_minus_unroll4` and `axpy2_minus_unroll4_nofma` respectively.
///
/// Inputs come from a fixed-seed generator, so every run sees the same data.
/// All variants of one length read the same source buffers and scale
/// factors; each variant gets its own copy of the initial destination. The
/// destination is updated in place and is not reset between timed iterations.
fn bench_axpy2_minus(c: &mut Criterion) {
    let mut group = c.benchmark_group("axpy2_minus");
    let mut rng = Xorshift64::new(0xDEAD_BEEF_CAFE_BABE);

    for &len in LENGTHS {
        // Draw order (src0, src1, dst, alpha0, alpha1) determines the
        // generated data; keep it.
        let src0 = make_vec(&mut rng, len);
        let src1 = make_vec(&mut rng, len);
        let dst_init = make_vec(&mut rng, len);
        let alpha0 = rng.next_f64() * 1.5;
        let alpha1 = rng.next_f64() * 1.5;

        // Report throughput in elements per second for this length.
        group.throughput(Throughput::Elements(len as u64));

        // The closures borrow the sources and scale factors directly; only the
        // mutable destination needs a private copy, made once outside the
        // timed loop.
        group.bench_function(BenchmarkId::new("scalar", len), |b| {
            let mut dst = dst_init.clone();
            b.iter(|| {
                scalar_axpy2_minus(
                    black_box(&mut dst),
                    black_box(&src0),
                    black_box(alpha0),
                    black_box(&src1),
                    black_box(alpha1),
                );
            });
        });

        group.bench_function(BenchmarkId::new("pulp", len), |b| {
            let mut dst = dst_init.clone();
            b.iter(|| {
                axpy2_minus(
                    black_box(&mut dst),
                    black_box(&src0),
                    black_box(alpha0),
                    black_box(&src1),
                    black_box(alpha1),
                );
            });
        });

        #[cfg(target_arch = "aarch64")]
        {
            group.bench_function(BenchmarkId::new("direct_neon", len), |b| {
                let mut dst = dst_init.clone();
                b.iter(|| {
                    axpy2_minus_direct(
                        black_box(&mut dst),
                        black_box(&src0),
                        black_box(alpha0),
                        black_box(&src1),
                        black_box(alpha1),
                    );
                });
            });

            group.bench_function(BenchmarkId::new("unroll4_neon", len), |b| {
                let mut dst = dst_init.clone();
                b.iter(|| {
                    axpy2_minus_unroll4(
                        black_box(&mut dst),
                        black_box(&src0),
                        black_box(alpha0),
                        black_box(&src1),
                        black_box(alpha1),
                    );
                });
            });

            group.bench_function(BenchmarkId::new("unroll4_nofma_neon", len), |b| {
                let mut dst = dst_init.clone();
                b.iter(|| {
                    axpy2_minus_unroll4_nofma(
                        black_box(&mut dst),
                        black_box(&src0),
                        black_box(alpha0),
                        black_box(&src1),
                        black_box(alpha1),
                    );
                });
            });
        }
    }

    group.finish();
}
// Collect both benchmark functions into the `benches` group and let Criterion
// generate the benchmark binary's entry point for it.
criterion_group!(benches, bench_axpy_minus, bench_axpy2_minus);
criterion_main!(benches);