use core::f32;
use fearless_simd::{f32x16, f32x4, f32x8, f64x4, f64x8, Simd, SimdBase, SimdFloat, SimdFrom};
use num_traits::Float;
/// Two-point butterfly over adjacent element pairs of split-complex data.
///
/// For every complete pair `(z0, z1)` taken from `reals`/`imags`, the pair is overwritten
/// with `(z0 + z1, z0 - z1)`. No twiddle factor is involved.
///
/// `_simd` is accepted but not used. A trailing unpaired element is left untouched, and if
/// the two slices differ in length only the pairs present in both are processed.
#[inline(always)]
pub fn fft_dit_chunk_2<S: Simd, T: Float>(_simd: S, reals: &mut [T], imags: &mut [T]) {
    let pairs = reals.chunks_exact_mut(2).zip(imags.chunks_exact_mut(2));
    for (re_pair, im_pair) in pairs {
        // Read both inputs before any store: the results overwrite the inputs in place.
        let (a_re, b_re) = (re_pair[0], re_pair[1]);
        let (a_im, b_im) = (im_pair[0], im_pair[1]);

        re_pair[0] = a_re + b_re;
        im_pair[0] = a_im + b_im;
        re_pair[1] = a_re - b_re;
        im_pair[1] = a_im - b_im;
    }
}
/// Out-of-line entry point for the butterfly pass over 4-element `f64` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_4_simd_f64`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_4_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_4_simd_f64(simd, reals, imags),
)
}
/// Butterfly pass over 4-element chunks of split-complex `f64` data, in scalar arithmetic.
///
/// Each complete 4-element chunk of `reals`/`imags` is handled as a lower and an upper half
/// of two elements. With `a` from the lower half, `b` from the upper half and twiddle `w`,
/// the lower half receives `a + w * b` and the upper half `a - w * b`, the latter evaluated
/// as `2 * a - (a + w * b)` with a fused multiply-add. Lane 0 uses `w = 1`, lane 1 uses
/// `w = -i`.
///
/// `_simd` is not used. Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_4_simd_f64<S: Simd>(_simd: S, reals: &mut [f64], imags: &mut [f64]) {
    const DIST: usize = 2;
    const CHUNK_SIZE: usize = DIST * 2;
    let two = 2.0_f64;

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        // Lane 0: elements 0 and DIST, twiddle 1.
        let (a_re, a_im) = (re[0], im[0]);
        let (b_re, b_im) = (re[DIST], im[DIST]);
        let sum_re = a_re + b_re;
        let sum_im = a_im + b_im;
        re[0] = sum_re;
        im[0] = sum_im;
        re[DIST] = a_re.mul_add(two, -sum_re);
        im[DIST] = a_im.mul_add(two, -sum_im);

        // Lane 1: elements 1 and DIST + 1, twiddle -i, so w * b = b_im - i * b_re.
        let (a_re, a_im) = (re[1], im[1]);
        let (b_re, b_im) = (re[DIST + 1], im[DIST + 1]);
        let sum_re = a_re + b_im;
        let sum_im = a_im - b_re;
        re[1] = sum_re;
        im[1] = sum_im;
        re[DIST + 1] = a_re.mul_add(two, -sum_re);
        im[DIST + 1] = a_im.mul_add(two, -sum_im);
    }
}
/// Out-of-line entry point for the butterfly pass over 4-element `f32` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_4_simd_f32`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_4_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_4_simd_f32(simd, reals, imags),
)
}
/// Butterfly pass over 4-element chunks of split-complex `f32` data, in scalar arithmetic.
///
/// Each complete 4-element chunk of `reals`/`imags` is handled as a lower and an upper half
/// of two elements. With `a` from the lower half, `b` from the upper half and twiddle `w`,
/// the lower half receives `a + w * b` and the upper half `a - w * b`, the latter evaluated
/// as `2 * a - (a + w * b)` with a fused multiply-add. Lane 0 uses `w = 1`, lane 1 uses
/// `w = -i`.
///
/// `_simd` is not used. Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_4_simd_f32<S: Simd>(_simd: S, reals: &mut [f32], imags: &mut [f32]) {
    const DIST: usize = 2;
    const CHUNK_SIZE: usize = DIST * 2;
    let two = 2.0_f32;

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        // Lane 0: elements 0 and DIST, twiddle 1.
        let (a_re, a_im) = (re[0], im[0]);
        let (b_re, b_im) = (re[DIST], im[DIST]);
        let sum_re = a_re + b_re;
        let sum_im = a_im + b_im;
        re[0] = sum_re;
        im[0] = sum_im;
        re[DIST] = a_re.mul_add(two, -sum_re);
        im[DIST] = a_im.mul_add(two, -sum_im);

        // Lane 1: elements 1 and DIST + 1, twiddle -i, so w * b = b_im - i * b_re.
        let (a_re, a_im) = (re[1], im[1]);
        let (b_re, b_im) = (re[DIST + 1], im[DIST + 1]);
        let sum_re = a_re + b_im;
        let sum_im = a_im - b_re;
        re[1] = sum_re;
        im[1] = sum_im;
        re[DIST + 1] = a_re.mul_add(two, -sum_re);
        im[DIST + 1] = a_im.mul_add(two, -sum_im);
    }
}
/// Out-of-line entry point for the butterfly pass over 8-element `f64` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_8_simd_f64`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_8_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_8_simd_f64(simd, reals, imags),
)
}
/// Butterfly pass over 8-element chunks of split-complex `f64` data using `f64x4` vectors.
///
/// Each complete 8-element chunk of `reals`/`imags` is split into a lower half `a` and an
/// upper half `b` of four elements. With the per-lane twiddle `w`, the lower half receives
/// `a + w * b` and the upper half `a - w * b`, evaluated as `2 * a - (a + w * b)`.
/// The twiddles are `cos(2*pi*k/8) - i * sin(2*pi*k/8)` for `k = 0..4`.
///
/// Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_8_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
    const DIST: usize = 4;
    const CHUNK_SIZE: usize = DIST * 2;
    const H: f64 = std::f64::consts::FRAC_1_SQRT_2;

    let two = f64x4::splat(simd, 2.0);
    let tw_re = f64x4::simd_from(simd, [1.0, H, 0.0, -H]);
    let tw_im = f64x4::simd_from(simd, [0.0, -H, -1.0, -H]);

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        let (re_lo, re_hi) = re.split_at_mut(DIST);
        let (im_lo, im_hi) = im.split_at_mut(DIST);

        let a_re = f64x4::from_slice(simd, &re_lo[..DIST]);
        let b_re = f64x4::from_slice(simd, &re_hi[..DIST]);
        let a_im = f64x4::from_slice(simd, &im_lo[..DIST]);
        let b_im = f64x4::from_slice(simd, &im_hi[..DIST]);

        // sum = a + w * b, split into real and imaginary parts:
        //   re: a_re + tw_re * b_re - tw_im * b_im
        //   im: a_im + tw_re * b_im + tw_im * b_re
        let partial_re = tw_re.mul_add(b_re, a_re);
        let sum_re = tw_im.mul_add(-b_im, partial_re);
        let partial_im = tw_re.mul_add(b_im, a_im);
        let sum_im = tw_im.mul_add(b_re, partial_im);

        // diff = a - w * b, obtained as 2 * a - sum.
        let diff_re = two.mul_sub(a_re, sum_re);
        let diff_im = two.mul_sub(a_im, sum_im);

        sum_re.store_slice(re_lo);
        sum_im.store_slice(im_lo);
        diff_re.store_slice(re_hi);
        diff_im.store_slice(im_hi);
    }
}
/// Out-of-line entry point for the butterfly pass over 8-element `f32` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_8_simd_f32`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_8_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_8_simd_f32(simd, reals, imags),
)
}
/// Butterfly pass over 8-element chunks of split-complex `f32` data using `f32x4` vectors.
///
/// Each complete 8-element chunk of `reals`/`imags` is split into a lower half `a` and an
/// upper half `b` of four elements. With the per-lane twiddle `w`, the lower half receives
/// `a + w * b` and the upper half `a - w * b`, evaluated as `2 * a - (a + w * b)`.
/// The twiddles are `cos(2*pi*k/8) - i * sin(2*pi*k/8)` for `k = 0..4`.
///
/// Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_8_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
    const DIST: usize = 4;
    const CHUNK_SIZE: usize = DIST * 2;
    const H: f32 = std::f32::consts::FRAC_1_SQRT_2;

    let two = f32x4::splat(simd, 2.0);
    let tw_re = f32x4::simd_from(simd, [1.0_f32, H, 0.0_f32, -H]);
    let tw_im = f32x4::simd_from(simd, [0.0_f32, -H, -1.0_f32, -H]);

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        let (re_lo, re_hi) = re.split_at_mut(DIST);
        let (im_lo, im_hi) = im.split_at_mut(DIST);

        let a_re = f32x4::from_slice(simd, &re_lo[..DIST]);
        let b_re = f32x4::from_slice(simd, &re_hi[..DIST]);
        let a_im = f32x4::from_slice(simd, &im_lo[..DIST]);
        let b_im = f32x4::from_slice(simd, &im_hi[..DIST]);

        // sum = a + w * b, split into real and imaginary parts:
        //   re: a_re + tw_re * b_re - tw_im * b_im
        //   im: a_im + tw_re * b_im + tw_im * b_re
        let partial_re = tw_re.mul_add(b_re, a_re);
        let sum_re = tw_im.mul_add(-b_im, partial_re);
        let partial_im = tw_re.mul_add(b_im, a_im);
        let sum_im = tw_im.mul_add(b_re, partial_im);

        // diff = a - w * b, obtained as 2 * a - sum.
        let diff_re = two.mul_sub(a_re, sum_re);
        let diff_im = two.mul_sub(a_im, sum_im);

        sum_re.store_slice(re_lo);
        sum_im.store_slice(im_lo);
        diff_re.store_slice(re_hi);
        diff_im.store_slice(im_hi);
    }
}
/// Out-of-line entry point for the butterfly pass over 16-element `f64` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_16_simd_f64`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_16_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_16_simd_f64(simd, reals, imags),
)
}
/// Butterfly pass over 16-element chunks of split-complex `f64` data using `f64x8` vectors.
///
/// Each complete 16-element chunk of `reals`/`imags` is split into a lower half `a` and an
/// upper half `b` of eight elements. With the per-lane twiddle `w`, the lower half receives
/// `a + w * b` and the upper half `a - w * b`, evaluated as `2 * a - (a + w * b)`.
/// The twiddles are `cos(2*pi*k/16) - i * sin(2*pi*k/16)` for `k = 0..8`.
///
/// Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_16_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
    const DIST: usize = 8;
    const CHUNK_SIZE: usize = DIST * 2;

    // cos(pi/8), cos(pi/4) and sin(pi/8); every table entry is one of these up to sign.
    const C: f64 = 0.9238795325112867;
    const H: f64 = std::f64::consts::FRAC_1_SQRT_2;
    const S: f64 = 0.38268343236508984;

    let two = f64x8::splat(simd, 2.0);
    let tw_re = f64x8::simd_from(simd, [1.0, C, H, S, 0.0, -S, -H, -C]);
    let tw_im = f64x8::simd_from(simd, [0.0, -S, -H, -C, -1.0, -C, -H, -S]);

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        let (re_lo, re_hi) = re.split_at_mut(DIST);
        let (im_lo, im_hi) = im.split_at_mut(DIST);

        let a_re = f64x8::from_slice(simd, &re_lo[..DIST]);
        let b_re = f64x8::from_slice(simd, &re_hi[..DIST]);
        let a_im = f64x8::from_slice(simd, &im_lo[..DIST]);
        let b_im = f64x8::from_slice(simd, &im_hi[..DIST]);

        // sum = a + w * b, split into real and imaginary parts:
        //   re: a_re + tw_re * b_re - tw_im * b_im
        //   im: a_im + tw_re * b_im + tw_im * b_re
        let partial_re = tw_re.mul_add(b_re, a_re);
        let sum_re = tw_im.mul_add(-b_im, partial_re);
        let partial_im = tw_re.mul_add(b_im, a_im);
        let sum_im = tw_im.mul_add(b_re, partial_im);

        // diff = a - w * b, obtained as 2 * a - sum.
        let diff_re = two.mul_sub(a_re, sum_re);
        let diff_im = two.mul_sub(a_im, sum_im);

        sum_re.store_slice(re_lo);
        sum_im.store_slice(im_lo);
        diff_re.store_slice(re_hi);
        diff_im.store_slice(im_hi);
    }
}
/// Out-of-line entry point for the butterfly pass over 16-element `f32` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_16_simd_f32`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_16_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_16_simd_f32(simd, reals, imags),
)
}
/// Butterfly pass over 16-element chunks of split-complex `f32` data using `f32x8` vectors.
///
/// Each complete 16-element chunk of `reals`/`imags` is split into a lower half `a` and an
/// upper half `b` of eight elements. With the per-lane twiddle `w`, the lower half receives
/// `a + w * b` and the upper half `a - w * b`, evaluated as `2 * a - (a + w * b)`.
/// The twiddles are `cos(2*pi*k/16) - i * sin(2*pi*k/16)` for `k = 0..8`.
///
/// Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_16_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
    const DIST: usize = 8;
    const CHUNK_SIZE: usize = DIST * 2;

    // cos(pi/8), cos(pi/4) and sin(pi/8); every table entry is one of these up to sign.
    const C: f32 = 0.923_879_5_f32;
    const H: f32 = std::f32::consts::FRAC_1_SQRT_2;
    const S: f32 = 0.382_683_43_f32;

    let two = f32x8::splat(simd, 2.0);
    let tw_re = f32x8::simd_from(simd, [1.0_f32, C, H, S, 0.0_f32, -S, -H, -C]);
    let tw_im = f32x8::simd_from(simd, [0.0_f32, -S, -H, -C, -1.0_f32, -C, -H, -S]);

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        let (re_lo, re_hi) = re.split_at_mut(DIST);
        let (im_lo, im_hi) = im.split_at_mut(DIST);

        let a_re = f32x8::from_slice(simd, &re_lo[..DIST]);
        let b_re = f32x8::from_slice(simd, &re_hi[..DIST]);
        let a_im = f32x8::from_slice(simd, &im_lo[..DIST]);
        let b_im = f32x8::from_slice(simd, &im_hi[..DIST]);

        // sum = a + w * b, split into real and imaginary parts:
        //   re: a_re + tw_re * b_re - tw_im * b_im
        //   im: a_im + tw_re * b_im + tw_im * b_re
        let partial_re = tw_re.mul_add(b_re, a_re);
        let sum_re = tw_im.mul_add(-b_im, partial_re);
        let partial_im = tw_re.mul_add(b_im, a_im);
        let sum_im = tw_im.mul_add(b_re, partial_im);

        // diff = a - w * b, obtained as 2 * a - sum.
        let diff_re = two.mul_sub(a_re, sum_re);
        let diff_im = two.mul_sub(a_im, sum_im);

        sum_re.store_slice(re_lo);
        sum_im.store_slice(im_lo);
        diff_re.store_slice(re_hi);
        diff_im.store_slice(im_hi);
    }
}
/// Out-of-line entry point for the butterfly pass over 32-element `f64` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_32_simd_f64`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_32_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_32_simd_f64(simd, reals, imags),
)
}
/// Butterfly pass over 32-element chunks of split-complex `f64` data using `f64x8` vectors.
///
/// Each complete 32-element chunk of `reals`/`imags` is split into a lower half `a` and an
/// upper half `b` of sixteen elements, processed as two 8-lane segments. With the per-lane
/// twiddle `w`, the lower half receives `a + w * b` and the upper half `a - w * b`,
/// evaluated as `2 * a - (a + w * b)`. The twiddles are
/// `cos(2*pi*k/32) - i * sin(2*pi*k/32)` for `k = 0..16`.
///
/// Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_32_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
    const DIST: usize = 16;
    const CHUNK_SIZE: usize = DIST * 2;
    const LANES: usize = 8;

    // C{k} = cos(k * pi / 16). Since sin(k * pi / 16) = cos((8 - k) * pi / 16), the same
    // seven constants also provide every sine value in the tables below.
    const C1: f64 = 0.9807852804032304;
    const C2: f64 = 0.9238795325112867;
    const C3: f64 = 0.8314696123025452;
    const C4: f64 = std::f64::consts::FRAC_1_SQRT_2;
    const C5: f64 = 0.5555702330196022;
    const C6: f64 = 0.3826834323650898;
    const C7: f64 = 0.19509032201612825;

    let two = f64x8::splat(simd, 2.0);

    // One entry per 8-lane segment: (offset into each half, twiddle real, twiddle imaginary).
    let segments = [
        (
            0_usize,
            f64x8::simd_from(simd, [1.0, C1, C2, C3, C4, C5, C6, C7]),
            f64x8::simd_from(simd, [0.0, -C7, -C6, -C5, -C4, -C3, -C2, -C1]),
        ),
        (
            8,
            f64x8::simd_from(simd, [0.0, -C7, -C6, -C5, -C4, -C3, -C2, -C1]),
            f64x8::simd_from(simd, [-1.0, -C1, -C2, -C3, -C4, -C5, -C6, -C7]),
        ),
    ];

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        let (re_lo, re_hi) = re.split_at_mut(DIST);
        let (im_lo, im_hi) = im.split_at_mut(DIST);

        for &(start, tw_re, tw_im) in &segments {
            let end = start + LANES;

            let a_re = f64x8::from_slice(simd, &re_lo[start..end]);
            let b_re = f64x8::from_slice(simd, &re_hi[start..end]);
            let a_im = f64x8::from_slice(simd, &im_lo[start..end]);
            let b_im = f64x8::from_slice(simd, &im_hi[start..end]);

            // sum = a + w * b
            let partial_re = tw_re.mul_add(b_re, a_re);
            let sum_re = tw_im.mul_add(-b_im, partial_re);
            let partial_im = tw_re.mul_add(b_im, a_im);
            let sum_im = tw_im.mul_add(b_re, partial_im);

            // diff = a - w * b, obtained as 2 * a - sum.
            let diff_re = two.mul_sub(a_re, sum_re);
            let diff_im = two.mul_sub(a_im, sum_im);

            sum_re.store_slice(&mut re_lo[start..end]);
            sum_im.store_slice(&mut im_lo[start..end]);
            diff_re.store_slice(&mut re_hi[start..end]);
            diff_im.store_slice(&mut im_hi[start..end]);
        }
    }
}
/// Out-of-line entry point for the butterfly pass over 32-element `f32` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_32_simd_f32`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_32_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_32_simd_f32(simd, reals, imags),
)
}
/// Butterfly pass over 32-element chunks of split-complex `f32` data using `f32x16` vectors.
///
/// Each complete 32-element chunk of `reals`/`imags` is split into a lower half `a` and an
/// upper half `b` of sixteen elements. With the per-lane twiddle `w`, the lower half
/// receives `a + w * b` and the upper half `a - w * b`, evaluated as
/// `2 * a - (a + w * b)`. The twiddles are `cos(2*pi*k/32) - i * sin(2*pi*k/32)` for
/// `k = 0..16`.
///
/// Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_32_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
    const DIST: usize = 16;
    const CHUNK_SIZE: usize = DIST * 2;

    // C{k} = cos(k * pi / 16). Since sin(k * pi / 16) = cos((8 - k) * pi / 16), the same
    // seven constants also provide every sine value in the tables below.
    const C1: f32 = 0.980_785_25_f32;
    const C2: f32 = 0.923_879_5_f32;
    const C3: f32 = 0.831_469_6_f32;
    // This path resolves through the file-level `use core::f32;`.
    const C4: f32 = f32::consts::FRAC_1_SQRT_2;
    const C5: f32 = 0.555_570_24_f32;
    const C6: f32 = 0.382_683_43_f32;
    const C7: f32 = 0.195_090_32_f32;

    let two = f32x16::splat(simd, 2.0);
    let tw_re = f32x16::simd_from(
        simd,
        [
            1.0_f32, C1, C2, C3, C4, C5, C6, C7, //
            0.0_f32, -C7, -C6, -C5, -C4, -C3, -C2, -C1,
        ],
    );
    let tw_im = f32x16::simd_from(
        simd,
        [
            0.0_f32, -C7, -C6, -C5, -C4, -C3, -C2, -C1, //
            -1.0_f32, -C1, -C2, -C3, -C4, -C5, -C6, -C7,
        ],
    );

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        let (re_lo, re_hi) = re.split_at_mut(DIST);
        let (im_lo, im_hi) = im.split_at_mut(DIST);

        let a_re = f32x16::from_slice(simd, &re_lo[..DIST]);
        let b_re = f32x16::from_slice(simd, &re_hi[..DIST]);
        let a_im = f32x16::from_slice(simd, &im_lo[..DIST]);
        let b_im = f32x16::from_slice(simd, &im_hi[..DIST]);

        // sum = a + w * b, split into real and imaginary parts:
        //   re: a_re + tw_re * b_re - tw_im * b_im
        //   im: a_im + tw_re * b_im + tw_im * b_re
        let partial_re = tw_re.mul_add(b_re, a_re);
        let sum_re = tw_im.mul_add(-b_im, partial_re);
        let partial_im = tw_re.mul_add(b_im, a_im);
        let sum_im = tw_im.mul_add(b_re, partial_im);

        // diff = a - w * b, obtained as 2 * a - sum.
        let diff_re = two.mul_sub(a_re, sum_re);
        let diff_im = two.mul_sub(a_im, sum_im);

        sum_re.store_slice(re_lo);
        sum_im.store_slice(im_lo);
        diff_re.store_slice(re_hi);
        diff_im.store_slice(im_hi);
    }
}
/// Out-of-line entry point for the butterfly pass over 64-element `f64` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_64_simd_f64`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_64_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_64_simd_f64(simd, reals, imags),
)
}
#[inline(always)] fn fft_dit_chunk_64_simd_f64<S: Simd>(simd: S, reals: &mut [f64], imags: &mut [f64]) {
const DIST: usize = 32;
const CHUNK_SIZE: usize = DIST * 2;
let two = f64x8::splat(simd, 2.0);
let twiddle_re_0_7 = f64x8::simd_from(
simd,
[
1.0, 0.9951847266721969, 0.9807852804032304, 0.9569403357322089, 0.9238795325112867, 0.8819212643483549, 0.8314696123025452, 0.773010453362737, ],
);
let twiddle_im_0_7 = f64x8::simd_from(
simd,
[
0.0, -0.0980171403295606, -0.19509032201612825, -0.29028467725446233, -0.3826834323650898, -0.47139673682599764, -0.5555702330196022, -0.6343932841636455, ],
);
let twiddle_re_8_15 = f64x8::simd_from(
simd,
[
std::f64::consts::FRAC_1_SQRT_2, 0.6343932841636455, 0.5555702330196022, 0.47139673682599764, 0.3826834323650898, 0.29028467725446233, 0.19509032201612825, 0.0980171403295606, ],
);
let twiddle_im_8_15 = f64x8::simd_from(
simd,
[
-std::f64::consts::FRAC_1_SQRT_2, -0.773010453362737, -0.8314696123025452, -0.8819212643483549, -0.9238795325112867, -0.9569403357322089, -0.9807852804032304, -0.9951847266721969, ],
);
let twiddle_re_16_23 = f64x8::simd_from(
simd,
[
0.0, -0.0980171403295606, -0.19509032201612825, -0.29028467725446233, -0.3826834323650898, -0.47139673682599764, -0.5555702330196022, -0.6343932841636455, ],
);
let twiddle_im_16_23 = f64x8::simd_from(
simd,
[
-1.0, -0.9951847266721969, -0.9807852804032304, -0.9569403357322089, -0.9238795325112867, -0.8819212643483549, -0.8314696123025452, -0.773010453362737, ],
);
let twiddle_re_24_31 = f64x8::simd_from(
simd,
[
-std::f64::consts::FRAC_1_SQRT_2, -0.773010453362737, -0.8314696123025452, -0.8819212643483549, -0.9238795325112867, -0.9569403357322089, -0.9807852804032304, -0.9951847266721969, ],
);
let twiddle_im_24_31 = f64x8::simd_from(
simd,
[
-std::f64::consts::FRAC_1_SQRT_2, -0.6343932841636455, -0.5555702330196022, -0.47139673682599764, -0.3826834323650898, -0.29028467725446233, -0.19509032201612825, -0.0980171403295606, ],
);
(reals.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
.zip(imags.as_chunks_mut::<CHUNK_SIZE>().0.iter_mut())
.for_each(|(reals_chunk, imags_chunk)| {
let (reals_s0, reals_s1) = reals_chunk.split_at_mut(DIST);
let (imags_s0, imags_s1) = imags_chunk.split_at_mut(DIST);
let in0_re = f64x8::from_slice(simd, &reals_s0[0..8]);
let in1_re = f64x8::from_slice(simd, &reals_s1[0..8]);
let in0_im = f64x8::from_slice(simd, &imags_s0[0..8]);
let in1_im = f64x8::from_slice(simd, &imags_s1[0..8]);
let out0_re = twiddle_im_0_7.mul_add(-in1_im, twiddle_re_0_7.mul_add(in1_re, in0_re));
let out0_im = twiddle_im_0_7.mul_add(in1_re, twiddle_re_0_7.mul_add(in1_im, in0_im));
let out1_re = two.mul_sub(in0_re, out0_re);
let out1_im = two.mul_sub(in0_im, out0_im);
out0_re.store_slice(&mut reals_s0[0..8]);
out0_im.store_slice(&mut imags_s0[0..8]);
out1_re.store_slice(&mut reals_s1[0..8]);
out1_im.store_slice(&mut imags_s1[0..8]);
let in0_re = f64x8::from_slice(simd, &reals_s0[8..16]);
let in1_re = f64x8::from_slice(simd, &reals_s1[8..16]);
let in0_im = f64x8::from_slice(simd, &imags_s0[8..16]);
let in1_im = f64x8::from_slice(simd, &imags_s1[8..16]);
let out0_re = twiddle_im_8_15.mul_add(-in1_im, twiddle_re_8_15.mul_add(in1_re, in0_re));
let out0_im = twiddle_im_8_15.mul_add(in1_re, twiddle_re_8_15.mul_add(in1_im, in0_im));
let out1_re = two.mul_sub(in0_re, out0_re);
let out1_im = two.mul_sub(in0_im, out0_im);
out0_re.store_slice(&mut reals_s0[8..16]);
out0_im.store_slice(&mut imags_s0[8..16]);
out1_re.store_slice(&mut reals_s1[8..16]);
out1_im.store_slice(&mut imags_s1[8..16]);
let in0_re = f64x8::from_slice(simd, &reals_s0[16..24]);
let in1_re = f64x8::from_slice(simd, &reals_s1[16..24]);
let in0_im = f64x8::from_slice(simd, &imags_s0[16..24]);
let in1_im = f64x8::from_slice(simd, &imags_s1[16..24]);
let out0_re =
twiddle_im_16_23.mul_add(-in1_im, twiddle_re_16_23.mul_add(in1_re, in0_re));
let out0_im =
twiddle_im_16_23.mul_add(in1_re, twiddle_re_16_23.mul_add(in1_im, in0_im));
let out1_re = two.mul_sub(in0_re, out0_re);
let out1_im = two.mul_sub(in0_im, out0_im);
out0_re.store_slice(&mut reals_s0[16..24]);
out0_im.store_slice(&mut imags_s0[16..24]);
out1_re.store_slice(&mut reals_s1[16..24]);
out1_im.store_slice(&mut imags_s1[16..24]);
let in0_re = f64x8::from_slice(simd, &reals_s0[24..32]);
let in1_re = f64x8::from_slice(simd, &reals_s1[24..32]);
let in0_im = f64x8::from_slice(simd, &imags_s0[24..32]);
let in1_im = f64x8::from_slice(simd, &imags_s1[24..32]);
let out0_re =
twiddle_im_24_31.mul_add(-in1_im, twiddle_re_24_31.mul_add(in1_re, in0_re));
let out0_im =
twiddle_im_24_31.mul_add(in1_re, twiddle_re_24_31.mul_add(in1_im, in0_im));
let out1_re = two.mul_sub(in0_re, out0_re);
let out1_im = two.mul_sub(in0_im, out0_im);
out0_re.store_slice(&mut reals_s0[24..32]);
out0_im.store_slice(&mut imags_s0[24..32]);
out1_re.store_slice(&mut reals_s1[24..32]);
out1_im.store_slice(&mut imags_s1[24..32]);
});
}
/// Out-of-line entry point for the butterfly pass over 64-element `f32` chunks.
///
/// Forwards `simd`, `reals` and `imags` unchanged to `fft_dit_chunk_64_simd_f32`, invoked
/// through `simd.vectorize`. Both slices are modified in place.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_64_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_64_simd_f32(simd, reals, imags),
)
}
/// Butterfly pass over 64-element chunks of split-complex `f32` data using `f32x16` vectors.
///
/// Each complete 64-element chunk of `reals`/`imags` is split into a lower half `a` and an
/// upper half `b` of thirty-two elements, processed as two 16-lane segments. With the
/// per-lane twiddle `w`, the lower half receives `a + w * b` and the upper half
/// `a - w * b`, evaluated as `2 * a - (a + w * b)`. The twiddles are
/// `cos(2*pi*k/64) - i * sin(2*pi*k/64)` for `k = 0..32`.
///
/// Elements after the last complete chunk are left untouched.
#[inline(always)]
fn fft_dit_chunk_64_simd_f32<S: Simd>(simd: S, reals: &mut [f32], imags: &mut [f32]) {
    const DIST: usize = 32;
    const CHUNK_SIZE: usize = DIST * 2;
    const LANES: usize = 16;

    // C{k} = cos(k * pi / 32). Since sin(k * pi / 32) = cos((16 - k) * pi / 32), the same
    // fifteen constants also provide every sine value in the tables below.
    const C1: f32 = 0.995_184_7_f32;
    const C2: f32 = 0.980_785_25_f32;
    const C3: f32 = 0.956_940_35_f32;
    const C4: f32 = 0.923_879_5_f32;
    const C5: f32 = 0.881_921_3_f32;
    const C6: f32 = 0.831_469_6_f32;
    const C7: f32 = 0.773_010_43_f32;
    const C8: f32 = std::f32::consts::FRAC_1_SQRT_2;
    const C9: f32 = 0.634_393_3_f32;
    const C10: f32 = 0.555_570_24_f32;
    const C11: f32 = 0.471_396_74_f32;
    const C12: f32 = 0.382_683_43_f32;
    const C13: f32 = 0.290_284_66_f32;
    const C14: f32 = 0.195_090_32_f32;
    const C15: f32 = 0.098_017_14_f32;

    let two = f32x16::splat(simd, 2.0);

    // One entry per 16-lane segment: (offset into each half, twiddle real, twiddle imaginary).
    let segments = [
        (
            0_usize,
            f32x16::simd_from(
                simd,
                [
                    1.0_f32, C1, C2, C3, C4, C5, C6, C7, //
                    C8, C9, C10, C11, C12, C13, C14, C15,
                ],
            ),
            f32x16::simd_from(
                simd,
                [
                    0.0_f32, -C15, -C14, -C13, -C12, -C11, -C10, -C9, //
                    -C8, -C7, -C6, -C5, -C4, -C3, -C2, -C1,
                ],
            ),
        ),
        (
            16,
            f32x16::simd_from(
                simd,
                [
                    0.0_f32, -C15, -C14, -C13, -C12, -C11, -C10, -C9, //
                    -C8, -C7, -C6, -C5, -C4, -C3, -C2, -C1,
                ],
            ),
            f32x16::simd_from(
                simd,
                [
                    -1.0_f32, -C1, -C2, -C3, -C4, -C5, -C6, -C7, //
                    -C8, -C9, -C10, -C11, -C12, -C13, -C14, -C15,
                ],
            ),
        ),
    ];

    let re_chunks = reals.as_chunks_mut::<CHUNK_SIZE>().0;
    let im_chunks = imags.as_chunks_mut::<CHUNK_SIZE>().0;

    for (re, im) in re_chunks.iter_mut().zip(im_chunks.iter_mut()) {
        let (re_lo, re_hi) = re.split_at_mut(DIST);
        let (im_lo, im_hi) = im.split_at_mut(DIST);

        for &(start, tw_re, tw_im) in &segments {
            let end = start + LANES;

            let a_re = f32x16::from_slice(simd, &re_lo[start..end]);
            let b_re = f32x16::from_slice(simd, &re_hi[start..end]);
            let a_im = f32x16::from_slice(simd, &im_lo[start..end]);
            let b_im = f32x16::from_slice(simd, &im_hi[start..end]);

            // sum = a + w * b
            let partial_re = tw_re.mul_add(b_re, a_re);
            let sum_re = tw_im.mul_add(-b_im, partial_re);
            let partial_im = tw_re.mul_add(b_im, a_im);
            let sum_im = tw_im.mul_add(b_re, partial_im);

            // diff = a - w * b, obtained as 2 * a - sum.
            let diff_re = two.mul_sub(a_re, sum_re);
            let diff_im = two.mul_sub(a_im, sum_im);

            sum_re.store_slice(&mut re_lo[start..end]);
            sum_im.store_slice(&mut im_lo[start..end]);
            diff_re.store_slice(&mut re_hi[start..end]);
            diff_im.store_slice(&mut im_hi[start..end]);
        }
    }
}
/// Out-of-line entry point for the butterfly pass with caller-supplied twiddles (`f64`).
///
/// Forwards every argument unchanged to `fft_dit_chunk_n_simd_f64`, invoked through
/// `simd.vectorize`. `reals` and `imags` are modified in place; `twiddles_re` and
/// `twiddles_im` are only read. `dist` is half the chunk size used by the kernel.
///
/// # Panics
///
/// The kernel asserts `dist * 2 >= 16`, so smaller values of `dist` panic.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_n_f64<S: Simd>(
simd: S,
reals: &mut [f64],
imags: &mut [f64],
twiddles_re: &[f64],
twiddles_im: &[f64],
dist: usize,
) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_n_simd_f64(simd, reals, imags, twiddles_re, twiddles_im, dist),
)
}
/// Butterfly pass over chunks of `2 * dist` split-complex `f64` elements, using twiddle
/// factors supplied by the caller and `f64x8` vectors.
///
/// Each complete chunk of `reals`/`imags` is split into a lower half `a` and an upper half
/// `b` of `dist` elements. The halves are walked eight lanes at a time in lockstep with
/// `twiddles_re`/`twiddles_im`; with the per-lane twiddle `w`, the lower half receives
/// `a + w * b` and the upper half `a - w * b`, evaluated as `2 * a - (a + w * b)`.
///
/// # Arguments
///
/// * `reals`, `imags` - real and imaginary parts, updated in place.
/// * `twiddles_re`, `twiddles_im` - real and imaginary twiddle parts; the same values are
///   reused for every chunk.
/// * `dist` - half the chunk size, i.e. the distance between paired elements.
///
/// # Panics
///
/// Panics if `dist * 2 < 16`.
///
/// # Notes
///
/// Only complete 8-lane blocks are processed, and the walk stops at the shortest of the
/// six zipped sequences. If `dist` is not a multiple of 8, or a twiddle slice holds fewer
/// than `dist` values, the remaining elements of each half are left unchanged without any
/// error. Elements after the last complete chunk are likewise untouched.
#[inline(always)]
fn fft_dit_chunk_n_simd_f64<S: Simd>(
    simd: S,
    reals: &mut [f64],
    imags: &mut [f64],
    twiddles_re: &[f64],
    twiddles_im: &[f64],
    dist: usize,
) {
    const LANES: usize = 8;
    let chunk_size = dist * 2;
    assert!(chunk_size >= LANES * 2);

    // Loop-invariant values, built once instead of once per lane block / per chunk.
    let two = f64x8::splat(simd, 2.0);
    let tw_re_blocks = twiddles_re.as_chunks::<LANES>().0;
    let tw_im_blocks = twiddles_im.as_chunks::<LANES>().0;

    for (reals_chunk, imags_chunk) in reals
        .chunks_exact_mut(chunk_size)
        .zip(imags.chunks_exact_mut(chunk_size))
    {
        let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist);
        let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist);

        let re_lo_blocks = reals_s0.as_chunks_mut::<LANES>().0;
        let re_hi_blocks = reals_s1.as_chunks_mut::<LANES>().0;
        let im_lo_blocks = imags_s0.as_chunks_mut::<LANES>().0;
        let im_hi_blocks = imags_s1.as_chunks_mut::<LANES>().0;

        for (((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im) in re_lo_blocks
            .iter_mut()
            .zip(re_hi_blocks.iter_mut())
            .zip(im_lo_blocks.iter_mut())
            .zip(im_hi_blocks.iter_mut())
            .zip(tw_re_blocks.iter())
            .zip(tw_im_blocks.iter())
        {
            let in0_re = f64x8::simd_from(simd, *re_s0);
            let in1_re = f64x8::simd_from(simd, *re_s1);
            let in0_im = f64x8::simd_from(simd, *im_s0);
            let in1_im = f64x8::simd_from(simd, *im_s1);
            let tw_re = f64x8::simd_from(simd, *tw_re);
            let tw_im = f64x8::simd_from(simd, *tw_im);

            // out0 = in0 + w * in1
            let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re));
            let out0_im = tw_im.mul_add(in1_re, tw_re.mul_add(in1_im, in0_im));

            // out1 = in0 - w * in1, obtained as 2 * in0 - out0.
            let out1_re = two.mul_sub(in0_re, out0_re);
            let out1_im = two.mul_sub(in0_im, out0_im);

            out0_re.store_slice(re_s0);
            out0_im.store_slice(im_s0);
            out1_re.store_slice(re_s1);
            out1_im.store_slice(im_s1);
        }
    }
}
/// Out-of-line entry point for the butterfly pass with caller-supplied twiddles (`f32`).
///
/// Forwards every argument unchanged to `fft_dit_chunk_n_simd_f32`, invoked through
/// `simd.vectorize`. `reals` and `imags` are modified in place; `twiddles_re` and
/// `twiddles_im` are only read. `dist` is half the chunk size used by the kernel.
///
/// # Panics
///
/// The kernel asserts `dist * 2 >= 32`, so smaller values of `dist` panic.
///
/// NOTE(review): `Simd::vectorize` presumably runs the closure with the CPU features of `S`
/// enabled, which would explain `#[inline(never)]` here combined with `#[inline(always)]` on
/// the closure and the kernel — confirm against the fearless_simd documentation.
#[inline(never)] pub fn fft_dit_chunk_n_f32<S: Simd>(
simd: S,
reals: &mut [f32],
imags: &mut [f32],
twiddles_re: &[f32],
twiddles_im: &[f32],
dist: usize,
) {
simd.vectorize(
#[inline(always)]
|| fft_dit_chunk_n_simd_f32(simd, reals, imags, twiddles_re, twiddles_im, dist),
)
}
/// Butterfly pass over chunks of `2 * dist` split-complex `f32` elements, using twiddle
/// factors supplied by the caller and `f32x16` vectors.
///
/// Each complete chunk of `reals`/`imags` is split into a lower half `a` and an upper half
/// `b` of `dist` elements. The halves are walked sixteen lanes at a time in lockstep with
/// `twiddles_re`/`twiddles_im`; with the per-lane twiddle `w`, the lower half receives
/// `a + w * b` and the upper half `a - w * b`, evaluated as `2 * a - (a + w * b)`.
///
/// # Arguments
///
/// * `reals`, `imags` - real and imaginary parts, updated in place.
/// * `twiddles_re`, `twiddles_im` - real and imaginary twiddle parts; the same values are
///   reused for every chunk.
/// * `dist` - half the chunk size, i.e. the distance between paired elements.
///
/// # Panics
///
/// Panics if `dist * 2 < 32`.
///
/// # Notes
///
/// Only complete 16-lane blocks are processed, and the walk stops at the shortest of the
/// six zipped sequences. If `dist` is not a multiple of 16, or a twiddle slice holds fewer
/// than `dist` values, the remaining elements of each half are left unchanged without any
/// error. Elements after the last complete chunk are likewise untouched.
#[inline(always)]
fn fft_dit_chunk_n_simd_f32<S: Simd>(
    simd: S,
    reals: &mut [f32],
    imags: &mut [f32],
    twiddles_re: &[f32],
    twiddles_im: &[f32],
    dist: usize,
) {
    const LANES: usize = 16;
    let chunk_size = dist * 2;
    assert!(chunk_size >= LANES * 2);

    // Loop-invariant values, built once instead of once per lane block / per chunk.
    let two = f32x16::splat(simd, 2.0);
    let tw_re_blocks = twiddles_re.as_chunks::<LANES>().0;
    let tw_im_blocks = twiddles_im.as_chunks::<LANES>().0;

    for (reals_chunk, imags_chunk) in reals
        .chunks_exact_mut(chunk_size)
        .zip(imags.chunks_exact_mut(chunk_size))
    {
        let (reals_s0, reals_s1) = reals_chunk.split_at_mut(dist);
        let (imags_s0, imags_s1) = imags_chunk.split_at_mut(dist);

        let re_lo_blocks = reals_s0.as_chunks_mut::<LANES>().0;
        let re_hi_blocks = reals_s1.as_chunks_mut::<LANES>().0;
        let im_lo_blocks = imags_s0.as_chunks_mut::<LANES>().0;
        let im_hi_blocks = imags_s1.as_chunks_mut::<LANES>().0;

        for (((((re_s0, re_s1), im_s0), im_s1), tw_re), tw_im) in re_lo_blocks
            .iter_mut()
            .zip(re_hi_blocks.iter_mut())
            .zip(im_lo_blocks.iter_mut())
            .zip(im_hi_blocks.iter_mut())
            .zip(tw_re_blocks.iter())
            .zip(tw_im_blocks.iter())
        {
            let in0_re = f32x16::simd_from(simd, *re_s0);
            let in1_re = f32x16::simd_from(simd, *re_s1);
            let in0_im = f32x16::simd_from(simd, *im_s0);
            let in1_im = f32x16::simd_from(simd, *im_s1);
            let tw_re = f32x16::simd_from(simd, *tw_re);
            let tw_im = f32x16::simd_from(simd, *tw_im);

            // out0 = in0 + w * in1
            let out0_re = tw_im.mul_add(-in1_im, tw_re.mul_add(in1_re, in0_re));
            let out0_im = tw_im.mul_add(in1_re, tw_re.mul_add(in1_im, in0_im));

            // out1 = in0 - w * in1, obtained as 2 * in0 - out0.
            let out1_re = two.mul_sub(in0_re, out0_re);
            let out1_im = two.mul_sub(in0_im, out0_im);

            out0_re.store_slice(re_s0);
            out0_im.store_slice(im_s0);
            out1_re.store_slice(re_s1);
            out1_im.store_slice(im_s1);
        }
    }
}