#[target_feature(enable = "sse2")]
pub(crate) unsafe fn convolve_interp_sse2(
input: &[f32],
coeffs1: &[f32],
coeffs2: &[f32],
frac: f32,
taps: usize,
) -> f32 {
use core::arch::x86_64::*;
unsafe {
const SIMD_WIDTH: usize = 4;
let simd_iterations = taps / SIMD_WIDTH;
assert_eq!(coeffs1.as_ptr().addr() % 64, 0);
assert_eq!(coeffs2.as_ptr().addr() % 64, 0);
let mut acc1 = _mm_setzero_ps();
let mut acc2 = _mm_setzero_ps();
for i in 0..simd_iterations {
let offset = i * SIMD_WIDTH;
let input_vec = _mm_loadu_ps(input.as_ptr().add(offset));
let coeffs_vec1 = _mm_load_ps(coeffs1.as_ptr().add(offset));
let coeffs_vec2 = _mm_load_ps(coeffs2.as_ptr().add(offset));
acc1 = _mm_add_ps(acc1, _mm_mul_ps(coeffs_vec1, input_vec));
acc2 = _mm_add_ps(acc2, _mm_mul_ps(coeffs_vec2, input_vec));
}
let frac_vec = _mm_set1_ps(frac);
let one_minus_frac = _mm_set1_ps(1.0 - frac);
let weighted1 = _mm_mul_ps(acc1, one_minus_frac);
let weighted2 = _mm_mul_ps(acc2, frac_vec);
let interpolated = _mm_add_ps(weighted1, weighted2);
let shuf = _mm_shuffle_ps(interpolated, interpolated, 0b01_00_11_10);
let sum1 = _mm_add_ps(interpolated, shuf);
let shuf2 = _mm_shuffle_ps(sum1, sum1, 0b00_00_00_01);
let sum2 = _mm_add_ps(sum1, shuf2);
_mm_cvtss_f32(sum2)
}
}