use core::arch::x86_64::*;
use crate::fft::{Complex32, butterflies::ops::complex_mul_sse4_2};
/// Radix-2 butterfly pass specialised for a stride of 1, processing two
/// butterflies per iteration with 128-bit SSE registers.
///
/// For each pair index `i` handled by the vector loop, the element `src[i]`
/// from the first half and `src[i + src.len() / 2]` from the second half are
/// combined into a sum and a difference, which are written next to each other
/// at `dst[2 * i]` and `dst[2 * i + 1]`. The vector loop does not read
/// `stage_twiddles`; it is only forwarded to the scalar routine.
///
/// Pair indices not covered by the vector loop (from the largest even number
/// not exceeding `src.len() / 2` onwards) are delegated to
/// `super::butterfly_radix2_scalar::<2>` with a stride of 1.
///
/// # Safety
///
/// * The executing CPU must support the `sse4.2` target feature.
/// * The vector loop performs no bounds checks: `dst` must be able to hold
///   at least `2 * n` elements, where `n` is `src.len() / 2` rounded down to
///   an even number.
/// * NOTE(review): any additional requirements of the scalar tail routine are
///   not visible from this function — confirm against its definition.
#[target_feature(enable = "sse4.2")]
pub(super) unsafe fn butterfly_radix2_stride1_sse4_2(
    src: &[Complex32],
    dst: &mut [Complex32],
    stage_twiddles: &[Complex32],
) {
    let half = src.len() / 2;
    // Largest even count that fits in `half`; the vector loop consumes pairs.
    let vector_end = half & !1;

    let input = src.as_ptr();
    let output = dst.as_mut_ptr();

    let mut idx = 0;
    while idx < vector_end {
        // SAFETY: `idx + 1 < vector_end <= half`, so both two-element loads
        // stay inside `src`. The two stores cover `dst[2 * idx..2 * idx + 4]`,
        // which the caller guarantees to be in bounds (see `# Safety`).
        unsafe {
            // Two consecutive complex values from each half of the input.
            let front = _mm_loadu_ps(input.add(idx).cast::<f32>());
            let back = _mm_loadu_ps(input.add(idx + half).cast::<f32>());

            // Reinterpret as pairs of 64-bit lanes so that one whole complex
            // value (two f32) moves as a single unit when interleaving.
            let sums = _mm_castps_pd(_mm_add_ps(front, back));
            let diffs = _mm_castps_pd(_mm_sub_ps(front, back));

            // [sum0, diff0] and [sum1, diff1]
            let first_pair = _mm_castpd_ps(_mm_unpacklo_pd(sums, diffs));
            let second_pair = _mm_castpd_ps(_mm_unpackhi_pd(sums, diffs));

            let out = output.add(2 * idx).cast::<f32>();
            _mm_storeu_ps(out, first_pair);
            // Four f32 further on, i.e. two complex values later.
            _mm_storeu_ps(out.add(4), second_pair);
        }
        idx += 2;
    }

    // Hand whatever the vector loop did not cover to the scalar routine.
    super::butterfly_radix2_scalar::<2>(src, dst, stage_twiddles, 1, vector_end);
}
/// Radix-2 butterfly pass for an arbitrary `stride`, processing two
/// butterflies per iteration with 128-bit SSE registers.
///
/// For each pair index `i` handled by the vector loop:
///
/// * `a = src[i]`, `b = src[i + src.len() / 2]`, `w = stage_twiddles[i]`
/// * `t = complex_mul_sse4_2(w, b)` (presumably the complex product `w * b` —
///   confirm against the helper's definition)
/// * `dst[j] = a + t` and `dst[j + stride] = a - t`, where
///   `j = 2 * i - (i % stride)`
///
/// Pair indices not covered by the vector loop (from the largest even number
/// not exceeding `src.len() / 2` onwards) are delegated to
/// `super::butterfly_radix2_scalar::<2>`.
///
/// Returns immediately, without touching `dst`, when `stride` is zero.
///
/// # Safety
///
/// * The executing CPU must support the `sse4.2` target feature.
/// * The vector loop performs no bounds checks: `stage_twiddles` must contain
///   at least `n` elements, where `n` is `src.len() / 2` rounded down to an
///   even number, and every output index `j` and `j + stride` described above
///   must be in bounds for `dst`.
/// * NOTE(review): any additional requirements of the scalar tail routine are
///   not visible from this function — confirm against its definition.
#[target_feature(enable = "sse4.2")]
pub(super) unsafe fn butterfly_radix2_generic_sse4_2(
    src: &[Complex32],
    dst: &mut [Complex32],
    stage_twiddles: &[Complex32],
    stride: usize,
) {
    // A zero stride would make the modulo below divide by zero.
    if stride == 0 {
        return;
    }

    let half = src.len() / 2;
    // Largest even count that fits in `half`; the vector loop consumes pairs.
    let vector_end = half & !1;

    let input = src.as_ptr();
    let twiddles = stage_twiddles.as_ptr();
    // View the output as 64-bit slots so each store moves one whole complex
    // value (two f32) and offsets are counted in complex elements.
    let output = dst.as_mut_ptr().cast::<f64>();

    let mut idx = 0;
    while idx < vector_end {
        // Position inside the current stride-sized group for both lanes; the
        // second lane wraps back to zero when it steps past the group end.
        let offset0 = idx % stride;
        let next = offset0 + 1;
        let offset1 = if next >= stride { next - stride } else { next };

        // Output slots for the "sum" results; the "difference" results land
        // `stride` slots later.
        let slot0 = 2 * idx - offset0;
        let slot1 = 2 * (idx + 1) - offset1;

        // SAFETY: `idx + 1 < vector_end <= half`, so both two-element loads
        // from `src` are in bounds. The twiddle load and the four stores are
        // in bounds by the caller's contract (see `# Safety`).
        unsafe {
            let front = _mm_loadu_ps(input.add(idx).cast::<f32>());
            let back = _mm_loadu_ps(input.add(idx + half).cast::<f32>());
            let factors = _mm_loadu_ps(twiddles.add(idx).cast::<f32>());

            let rotated = complex_mul_sse4_2(factors, back);

            let sums = _mm_castps_pd(_mm_add_ps(front, rotated));
            let diffs = _mm_castps_pd(_mm_sub_ps(front, rotated));

            // Low 64 bits hold the first butterfly, high 64 bits the second.
            _mm_storel_pd(output.add(slot0), sums);
            _mm_storel_pd(output.add(slot0 + stride), diffs);
            _mm_storeh_pd(output.add(slot1), sums);
            _mm_storeh_pd(output.add(slot1 + stride), diffs);
        }
        idx += 2;
    }

    // Hand whatever the vector loop did not cover to the scalar routine.
    super::butterfly_radix2_scalar::<2>(src, dst, stage_twiddles, stride, vector_end);
}