use crate::acceleration::simd::Simd;
/// Raw-pointer kernels for sum-of-products style reductions, built on the
/// vector primitives supplied by [`Simd`].
///
/// Every method is only compiled when the `neon_simd` cfg is set, and every
/// method is `unsafe` because it dereferences caller-supplied raw pointers.
/// Each kernel runs in three phases: blocks of four vectors, then single
/// vectors, then a scalar tail for the elements that do not fill a vector.
pub(crate) trait SIMDSumOfProducts: Simd {
    /// Updates `dst[i]` to `scalar.mul_add(src[i], dst[i])` for every `i` in
    /// `0..count` (assuming `mul_add` has the usual `self * a + b` meaning,
    /// that is `dst[i] += scalar * src[i]`).
    ///
    /// The vector phases call `simd_muladd(dst_vec, scalar_vec, src_vec)`,
    /// which is presumably the lane-wise `dst + scalar * src` -- confirm
    /// against the `Simd` implementation.
    ///
    /// # Safety
    ///
    /// * `src` must be valid for reads of `count` elements of `Self`.
    /// * `dst` must be valid for reads and writes of `count` elements of
    ///   `Self`.
    /// * Both pointers must be aligned for `Self`; any stricter requirement
    ///   of `simd_load` / `simd_store` is not visible here and must be
    ///   checked against `Simd`.
    #[cfg(neon_simd)]
    unsafe fn simd_sum_of_products_muladd(
        scalar: Self,
        src: *const Self,
        dst: *mut Self,
        count: usize,
    ) {
        let lanes = Self::LANES;
        let unrolled = 4 * lanes;
        let factor = Self::simd_from_constant(scalar);
        // Number of elements already processed; `pos <= count` at all times.
        let mut pos = 0usize;

        // Four vectors per pass. All loads are issued before the first store
        // of the pass, so the access order within a pass is unchanged.
        while count - pos >= unrolled {
            let input = src.add(pos);
            let output = dst.add(pos);

            let in0 = Self::simd_load(input);
            let in1 = Self::simd_load(input.add(lanes));
            let in2 = Self::simd_load(input.add(2 * lanes));
            let in3 = Self::simd_load(input.add(3 * lanes));

            let acc0 = Self::simd_load(output);
            let acc1 = Self::simd_load(output.add(lanes));
            let acc2 = Self::simd_load(output.add(2 * lanes));
            let acc3 = Self::simd_load(output.add(3 * lanes));

            let out0 = Self::simd_muladd(acc0, factor, in0);
            let out1 = Self::simd_muladd(acc1, factor, in1);
            let out2 = Self::simd_muladd(acc2, factor, in2);
            let out3 = Self::simd_muladd(acc3, factor, in3);

            Self::simd_store(output, out0);
            Self::simd_store(output.add(lanes), out1);
            Self::simd_store(output.add(2 * lanes), out2);
            Self::simd_store(output.add(3 * lanes), out3);

            pos += unrolled;
        }

        // One vector per pass for what the unrolled loop left over.
        while count - pos >= lanes {
            let output = dst.add(pos);
            let input = Self::simd_load(src.add(pos));
            let acc = Self::simd_load(output);
            Self::simd_store(output, Self::simd_muladd(acc, factor, input));
            pos += lanes;
        }

        // Scalar tail: fewer than `lanes` elements remain.
        for i in pos..count {
            let slot = dst.add(i);
            *slot = scalar.mul_add(*src.add(i), *slot);
        }
    }

    /// Sums the `count` elements starting at `src` and folds the result into
    /// the single element at `dst` as `*dst = scalar.mul_add(sum, *dst)`
    /// (assuming the usual `self * a + b` meaning: `*dst += scalar * sum`).
    ///
    /// # Safety
    ///
    /// * `src` must be valid for reads of `count` elements of `Self`.
    /// * `dst` must be valid for a read and a write of one `Self`.
    /// * Both pointers must be aligned for `Self`; any stricter requirement
    ///   of `simd_load` is not visible here and must be checked against
    ///   `Simd`.
    #[cfg(neon_simd)]
    unsafe fn simd_sum_of_scaled_array(
        scalar: Self,
        src: *const Self,
        dst: *mut Self,
        count: usize,
    ) {
        let lanes = Self::LANES;
        let unrolled = 4 * lanes;
        let mut partial = Self::simd_from_constant(Self::zero());
        // Number of elements already consumed; `pos <= count` at all times.
        let mut pos = 0usize;

        // Four vectors per pass, combined pairwise before touching the
        // running accumulator.
        while count - pos >= unrolled {
            let base = src.add(pos);

            let v0 = Self::simd_load(base);
            let v1 = Self::simd_load(base.add(lanes));
            let v2 = Self::simd_load(base.add(2 * lanes));
            let v3 = Self::simd_load(base.add(3 * lanes));

            let low = Self::simd_add(v0, v1);
            let high = Self::simd_add(v2, v3);
            partial = Self::simd_add(partial, Self::simd_add(low, high));

            pos += unrolled;
        }

        // One vector per pass for what the unrolled loop left over.
        while count - pos >= lanes {
            partial = Self::simd_add(partial, Self::simd_load(src.add(pos)));
            pos += lanes;
        }

        // Collapse the vector accumulator, then add the scalar tail.
        let mut total = Self::simd_horizontal_sum(partial);
        for i in pos..count {
            total += *src.add(i);
        }

        *dst = scalar.mul_add(total, *dst);
    }

    /// Accumulates the products `src0[i] * src1[i]` for `i` in `0..count`
    /// and adds the total to the single element at `dst` (`*dst += total`).
    ///
    /// The unrolled loop uses separate `simd_mul` / `simd_add` steps, while
    /// the single-vector loop and the scalar tail use the `muladd` forms.
    ///
    /// # Safety
    ///
    /// * `src0` and `src1` must each be valid for reads of `count` elements
    ///   of `Self`.
    /// * `dst` must be valid for a read and a write of one `Self`.
    /// * All pointers must be aligned for `Self`; any stricter requirement
    ///   of `simd_load` is not visible here and must be checked against
    ///   `Simd`.
    #[cfg(neon_simd)]
    unsafe fn simd_dot_product(
        src0: *const Self,
        src1: *const Self,
        dst: *mut Self,
        count: usize,
    ) {
        let lanes = Self::LANES;
        let unrolled = 4 * lanes;
        let mut partial = Self::simd_from_constant(Self::zero());
        // Number of element pairs already consumed; `pos <= count` always.
        let mut pos = 0usize;

        // Four vector pairs per pass: multiply each pair, then reduce the
        // four products pairwise before adding them to the accumulator.
        while count - pos >= unrolled {
            let lhs = src0.add(pos);
            let rhs = src1.add(pos);

            let x0 = Self::simd_load(lhs);
            let y0 = Self::simd_load(rhs);
            let x1 = Self::simd_load(lhs.add(lanes));
            let y1 = Self::simd_load(rhs.add(lanes));
            let x2 = Self::simd_load(lhs.add(2 * lanes));
            let y2 = Self::simd_load(rhs.add(2 * lanes));
            let x3 = Self::simd_load(lhs.add(3 * lanes));
            let y3 = Self::simd_load(rhs.add(3 * lanes));

            let p0 = Self::simd_mul(x0, y0);
            let p1 = Self::simd_mul(x1, y1);
            let p2 = Self::simd_mul(x2, y2);
            let p3 = Self::simd_mul(x3, y3);

            let p01 = Self::simd_add(p0, p1);
            let p23 = Self::simd_add(p2, p3);
            let block = Self::simd_add(p01, p23);
            partial = Self::simd_add(partial, block);

            pos += unrolled;
        }

        // One vector pair per pass for what the unrolled loop left over.
        while count - pos >= lanes {
            let x = Self::simd_load(src0.add(pos));
            let y = Self::simd_load(src1.add(pos));
            partial = Self::simd_muladd(partial, x, y);
            pos += lanes;
        }

        // Collapse the vector accumulator, then fold in the scalar tail.
        let mut total = Self::simd_horizontal_sum(partial);
        for i in pos..count {
            total = (*src0.add(i)).mul_add(*src1.add(i), total);
        }

        *dst += total;
    }
}
/// Blanket implementation: every type that implements `Simd` picks up the
/// default methods of `SIMDSumOfProducts` unchanged.
impl<T> SIMDSumOfProducts for T where T: Simd {}