use core::arch::x86_64::{__m128, __m128d};
use std::any::TypeId;
use std::sync::Arc;
use num_complex::Complex;
use crate::{common::FftNum, FftDirection};
use crate::array_utils::DoubleBuf;
use crate::twiddles;
use crate::{Direction, Fft, Length};
use super::sse_common::{assert_f32, assert_f64};
use super::sse_utils::*;
use super::sse_vector::*;
/// Lists the FFT lengths that have a dedicated SSE prime butterfly.
///
/// These are the same lengths that `construct_prime_butterfly` matches on.
pub const fn prime_butterfly_lens() -> &'static [usize] {
    const SUPPORTED_LENS: &[usize] = &[7, 11, 13, 17, 19, 23, 29, 31];
    SUPPORTED_LENS
}
/// Builds the SSE prime-length butterfly for `len`, type-erased behind `Arc<dyn Fft<T>>`.
///
/// The concrete butterfly is chosen from the runtime `TypeId` of `T` (`f32` or `f64`) and
/// from `len`, which has to be one of the lengths returned by `prime_butterfly_lens`.
///
/// # Panics
/// Panics (through `unimplemented!`) when `T` is neither `f32` nor `f64`, or when no
/// butterfly exists for `len`.
///
/// # Safety
/// This function is compiled with the `sse4.1` target feature enabled, so the caller must
/// make sure the running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
pub unsafe fn construct_prime_butterfly<T: FftNum>(len: usize, direction: FftDirection) -> Arc<dyn Fft<T>> {
    let scalar = TypeId::of::<T>();
    let is_f32 = scalar == TypeId::of::<f32>();
    // The scalar type is validated before `len`, so an unsupported `T` always reports
    // the type error even when `len` is invalid too.
    if !is_f32 && scalar != TypeId::of::<f64>() {
        unimplemented!("Not f32 or f64");
    }
    match (len, is_f32) {
        (7, true) => Arc::new(SseF32Butterfly7::new(direction)) as Arc<dyn Fft<T>>,
        (11, true) => Arc::new(SseF32Butterfly11::new(direction)) as Arc<dyn Fft<T>>,
        (13, true) => Arc::new(SseF32Butterfly13::new(direction)) as Arc<dyn Fft<T>>,
        (17, true) => Arc::new(SseF32Butterfly17::new(direction)) as Arc<dyn Fft<T>>,
        (19, true) => Arc::new(SseF32Butterfly19::new(direction)) as Arc<dyn Fft<T>>,
        (23, true) => Arc::new(SseF32Butterfly23::new(direction)) as Arc<dyn Fft<T>>,
        (29, true) => Arc::new(SseF32Butterfly29::new(direction)) as Arc<dyn Fft<T>>,
        (31, true) => Arc::new(SseF32Butterfly31::new(direction)) as Arc<dyn Fft<T>>,
        (7, false) => Arc::new(SseF64Butterfly7::new(direction)) as Arc<dyn Fft<T>>,
        (11, false) => Arc::new(SseF64Butterfly11::new(direction)) as Arc<dyn Fft<T>>,
        (13, false) => Arc::new(SseF64Butterfly13::new(direction)) as Arc<dyn Fft<T>>,
        (17, false) => Arc::new(SseF64Butterfly17::new(direction)) as Arc<dyn Fft<T>>,
        (19, false) => Arc::new(SseF64Butterfly19::new(direction)) as Arc<dyn Fft<T>>,
        (23, false) => Arc::new(SseF64Butterfly23::new(direction)) as Arc<dyn Fft<T>>,
        (29, false) => Arc::new(SseF64Butterfly29::new(direction)) as Arc<dyn Fft<T>>,
        (31, false) => Arc::new(SseF64Butterfly31::new(direction)) as Arc<dyn Fft<T>>,
        _ => unimplemented!("Invalid SSE prime butterfly length: {len}"),
    }
}
/// Computes twiddle factors `1..=TW` of a length-`len` FFT for `direction`.
///
/// Slot `k` of the returned array holds `twiddles::compute_twiddle(k + 1, len, direction)`;
/// twiddle index 0 is not stored.
#[inline(always)]
fn make_twiddles<const TW: usize, T: FftNum>(len: usize, direction: FftDirection) -> [Complex<T>; TW] {
    let mut index = 0;
    [(); TW].map(|()| {
        // Advance first so the sequence of twiddle indices is 1, 2, ..., TW.
        index += 1;
        twiddles::compute_twiddle(index, len, direction)
    })
}
/// SSE butterfly for length-7 FFTs that works on `__m128` vectors.
///
/// Stores the direction it was built for plus the real and imaginary parts of the first
/// three twiddle factors of a length-7 transform, one `__m128` per twiddle part.
struct SseF32Butterfly7<T> {
/// Direction given to `new`; also what the closure passed to the boilerplate macro returns.
direction: FftDirection,
/// Entry `k` is `broadcast_scalar` of the real part of twiddle `k + 1`.
twiddles_re: [__m128; 3],
/// Entry `k` is `broadcast_scalar` of the imaginary part of twiddle `k + 1`.
twiddles_im: [__m128; 3],
/// Zero-sized marker that ties the struct to the generic parameter `T`.
_phantom: std::marker::PhantomData<T>,
}
// The macro is defined elsewhere in the crate; it receives the type, the FFT length (7)
// and a closure returning the stored direction — presumably to generate the shared
// FFT trait plumbing (confirm in the macro definition).
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly7, 7, |this: &SseF32Butterfly7<_>| this.direction);
impl<T: FftNum> SseF32Butterfly7<T> {
/// Builds the butterfly for `direction`, precomputing twiddles 1..=3 of a length-7 FFT.
///
/// # Safety
/// Compiled with the `sse4.1` target feature enabled, so the caller must make sure the
/// running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
// Presumably rejects any `T` other than `f32` — confirm in `sse_common`.
assert_f32::<T>();
// The array length (3) is inferred from the `twiddles_re` / `twiddles_im` field types.
let twiddles = make_twiddles(7, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs a single length-7 FFT on `buffer`: reads elements 0..=6, transforms them with
/// the shared kernel and writes the results back to the same indices.
///
/// NOTE(review): the `partial1` read and `partial_lo` write macros presumably load and
/// store only one complex value per vector — confirm in `sse_utils`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 7 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6 });
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6 } );
}
/// Runs the kernel once over seven vectors loaded from even indices 0..=12 of `buffer`.
///
/// NOTE(review): this looks like two back-to-back length-7 FFTs (elements 0..=6 and
/// 7..=13), assuming each `__m128` holds two consecutive complex values — TODO confirm.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 14 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12 });
// Regroup the packed vectors so that `values[k]` combines halves of `input_packed[k / 2]`
// and `input_packed[(k + 7) / 2]`; with two complex values per vector that pairs
// element `k` with element `k + 7`.
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[3]),
extract_hi_lo_f32(input_packed[0], input_packed[4]),
extract_lo_hi_f32(input_packed[1], input_packed[4]),
extract_hi_lo_f32(input_packed[1], input_packed[5]),
extract_lo_hi_f32(input_packed[2], input_packed[5]),
extract_hi_lo_f32(input_packed[2], input_packed[6]),
extract_lo_hi_f32(input_packed[3], input_packed[6]),
];
let out = self.perform_parallel_fft_direct(values);
// Inverse shuffle: the first three vectors take low halves of consecutive outputs,
// the middle one joins `out[6]` (low) with `out[0]` (high), the rest take high halves.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_hi_f32(out[6], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
];
// The literal `2` is presumably the index stride used when storing — confirm in `sse_utils`.
write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6 });
}
/// Core length-7 butterfly on seven `__m128` vectors.
///
/// `values[k]` is input `k`; the result is returned in output order `y00..y06`.
/// The comments below assume `column_butterfly2([a, b])` yields `[a + b, a - b]`,
/// `fmadd(acc, a, b)` yields `acc + a * b` and `nmadd(acc, a, b)` yields `acc - a * b`
/// — TODO confirm against `sse_vector`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support at runtime — confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 7]) -> [__m128; 7] {
// Always built for `Inverse`; `self.direction` only enters through the twiddles
// that `new` computed.
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Fold input `k` with input `7 - k`: `xKpJ` is the first butterfly output, `xKmJ` the
// second one after the 90-degree rotation. `y00` sums `values[0]` and every `xKpJ`.
let y00 = values[0];
let [x1p6, x1m6] = SseVector::column_butterfly2([values[1], values[6]]);
let x1m6 = SseVector::apply_rotate90(rotate, x1m6);
let y00 = SseVector::add(y00, x1p6);
let [x2p5, x2m5] = SseVector::column_butterfly2([values[2], values[5]]);
let x2m5 = SseVector::apply_rotate90(rotate, x2m5);
let y00 = SseVector::add(y00, x2p5);
let [x3p4, x3m4] = SseVector::column_butterfly2([values[3], values[4]]);
let x3m4 = SseVector::apply_rotate90(rotate, x3m4);
let y00 = SseVector::add(y00, x3p4);
// Outputs 1 and 6: `a` combines `values[0]` and the `p` terms with real twiddle parts,
// `b` combines the rotated `m` terms with imaginary twiddle parts.
let m0106a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p6);
let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[1], x2p5);
let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[2], x3p4);
let m0106b = SseVector::mul(self.twiddles_im[0], x1m6);
let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[1], x2m5);
let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[2], x3m4);
let [y01, y06] = SseVector::column_butterfly2([m0106a, m0106b]);
// Outputs 2 and 5: same scheme with permuted twiddle indices; `nmadd` marks the
// terms whose imaginary twiddle part enters with a negative sign.
let m0205a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p6);
let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[2], x2p5);
let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[0], x3p4);
let m0205b = SseVector::mul(self.twiddles_im[1], x1m6);
let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[2], x2m5);
let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[0], x3m4);
let [y02, y05] = SseVector::column_butterfly2([m0205a, m0205b]);
// Outputs 3 and 4.
let m0304a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p6);
let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[0], x2p5);
let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[1], x3p4);
let m0304b = SseVector::mul(self.twiddles_im[2], x1m6);
let m0304b = SseVector::nmadd(m0304b, self.twiddles_im[0], x2m5);
let m0304b = SseVector::fmadd(m0304b, self.twiddles_im[1], x3m4);
let [y03, y04] = SseVector::column_butterfly2([m0304a, m0304b]);
[y00, y01, y02, y03, y04, y05, y06]
}
}
/// SSE butterfly for length-7 FFTs that works on `__m128d` vectors.
///
/// Stores the direction it was built for plus the real and imaginary parts of the first
/// three twiddle factors of a length-7 transform, one `__m128d` per twiddle part.
struct SseF64Butterfly7<T> {
/// Direction given to `new`; also what the closure passed to the boilerplate macro returns.
direction: FftDirection,
/// Entry `k` is `broadcast_scalar` of the real part of twiddle `k + 1`.
twiddles_re: [__m128d; 3],
/// Entry `k` is `broadcast_scalar` of the imaginary part of twiddle `k + 1`.
twiddles_im: [__m128d; 3],
/// Zero-sized marker that ties the struct to the generic parameter `T`.
_phantom: std::marker::PhantomData<T>,
}
// The macro is defined elsewhere in the crate; it receives the type, the FFT length (7)
// and a closure returning the stored direction — presumably to generate the shared
// FFT trait plumbing (confirm in the macro definition).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly7, 7, |this: &SseF64Butterfly7<_>| this.direction);
impl<T: FftNum> SseF64Butterfly7<T> {
/// Builds the butterfly for `direction`, precomputing twiddles 1..=3 of a length-7 FFT.
///
/// # Safety
/// Compiled with the `sse4.1` target feature enabled, so the caller must make sure the
/// running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
// Presumably rejects any `T` other than `f64` — confirm in `sse_common`.
assert_f64::<T>();
// The array length (3) is inferred from the `twiddles_re` / `twiddles_im` field types.
let twiddles = make_twiddles(7, direction);
// NOTE(review): the explicit `unsafe` block wraps the `broadcast_scalar` calls, which are
// presumably unsafe SSE wrappers — confirm in `sse_vector`.
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs a length-7 FFT on `buffer`: reads elements 0..=6, transforms them with
/// `perform_fft_direct` and writes the results back to the same indices.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 7 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6 });
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6 });
}
/// Core length-7 butterfly on seven `__m128d` vectors.
///
/// `values[k]` is input `k`; the result is returned in output order `y00..y06`.
/// The comments below assume `column_butterfly2([a, b])` yields `[a + b, a - b]`,
/// `fmadd(acc, a, b)` yields `acc + a * b` and `nmadd(acc, a, b)` yields `acc - a * b`
/// — TODO confirm against `sse_vector`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support at runtime — confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 7]) -> [__m128d; 7] {
// Always built for `Inverse`; `self.direction` only enters through the twiddles
// that `new` computed.
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Fold input `k` with input `7 - k`: `xKpJ` is the first butterfly output, `xKmJ` the
// second one after the 90-degree rotation. `y00` sums `values[0]` and every `xKpJ`.
let y00 = values[0];
let [x1p6, x1m6] = SseVector::column_butterfly2([values[1], values[6]]);
let x1m6 = SseVector::apply_rotate90(rotate, x1m6);
let y00 = SseVector::add(y00, x1p6);
let [x2p5, x2m5] = SseVector::column_butterfly2([values[2], values[5]]);
let x2m5 = SseVector::apply_rotate90(rotate, x2m5);
let y00 = SseVector::add(y00, x2p5);
let [x3p4, x3m4] = SseVector::column_butterfly2([values[3], values[4]]);
let x3m4 = SseVector::apply_rotate90(rotate, x3m4);
let y00 = SseVector::add(y00, x3p4);
// Outputs 1 and 6: `a` combines `values[0]` and the `p` terms with real twiddle parts,
// `b` combines the rotated `m` terms with imaginary twiddle parts.
let m0106a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p6);
let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[1], x2p5);
let m0106a = SseVector::fmadd(m0106a, self.twiddles_re[2], x3p4);
let m0106b = SseVector::mul(self.twiddles_im[0], x1m6);
let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[1], x2m5);
let m0106b = SseVector::fmadd(m0106b, self.twiddles_im[2], x3m4);
let [y01, y06] = SseVector::column_butterfly2([m0106a, m0106b]);
// Outputs 2 and 5: same scheme with permuted twiddle indices; `nmadd` marks the
// terms whose imaginary twiddle part enters with a negative sign.
let m0205a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p6);
let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[2], x2p5);
let m0205a = SseVector::fmadd(m0205a, self.twiddles_re[0], x3p4);
let m0205b = SseVector::mul(self.twiddles_im[1], x1m6);
let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[2], x2m5);
let m0205b = SseVector::nmadd(m0205b, self.twiddles_im[0], x3m4);
let [y02, y05] = SseVector::column_butterfly2([m0205a, m0205b]);
// Outputs 3 and 4.
let m0304a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p6);
let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[0], x2p5);
let m0304a = SseVector::fmadd(m0304a, self.twiddles_re[1], x3p4);
let m0304b = SseVector::mul(self.twiddles_im[2], x1m6);
let m0304b = SseVector::nmadd(m0304b, self.twiddles_im[0], x2m5);
let m0304b = SseVector::fmadd(m0304b, self.twiddles_im[1], x3m4);
let [y03, y04] = SseVector::column_butterfly2([m0304a, m0304b]);
[y00, y01, y02, y03, y04, y05, y06]
}
}
/// SSE butterfly for length-11 FFTs that works on `__m128` vectors.
///
/// Stores the direction it was built for plus the real and imaginary parts of the first
/// five twiddle factors of a length-11 transform, one `__m128` per twiddle part.
struct SseF32Butterfly11<T> {
/// Direction given to `new`; also what the closure passed to the boilerplate macro returns.
direction: FftDirection,
/// Entry `k` is `broadcast_scalar` of the real part of twiddle `k + 1`.
twiddles_re: [__m128; 5],
/// Entry `k` is `broadcast_scalar` of the imaginary part of twiddle `k + 1`.
twiddles_im: [__m128; 5],
/// Zero-sized marker that ties the struct to the generic parameter `T`.
_phantom: std::marker::PhantomData<T>,
}
// The macro is defined elsewhere in the crate; it receives the type, the FFT length (11)
// and a closure returning the stored direction — presumably to generate the shared
// FFT trait plumbing (confirm in the macro definition).
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly11, 11, |this: &SseF32Butterfly11<_>| this.direction);
impl<T: FftNum> SseF32Butterfly11<T> {
/// Builds the butterfly for `direction`, precomputing twiddles 1..=5 of a length-11 FFT.
///
/// # Safety
/// Compiled with the `sse4.1` target feature enabled, so the caller must make sure the
/// running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
// Presumably rejects any `T` other than `f32` — confirm in `sse_common`.
assert_f32::<T>();
// The array length (5) is inferred from the `twiddles_re` / `twiddles_im` field types.
let twiddles = make_twiddles(11, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs a single length-11 FFT on `buffer`: reads elements 0..=10, transforms them with
/// the shared kernel and writes the results back to the same indices.
///
/// NOTE(review): the `partial1` read and `partial_lo` write macros presumably load and
/// store only one complex value per vector — confirm in `sse_utils`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 11 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10 } );
}
/// Runs the kernel once over eleven vectors loaded from even indices 0..=20 of `buffer`.
///
/// NOTE(review): this looks like two back-to-back length-11 FFTs (elements 0..=10 and
/// 11..=21), assuming each `__m128` holds two consecutive complex values — TODO confirm.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 22 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20 });
// Regroup the packed vectors so that `values[k]` combines halves of `input_packed[k / 2]`
// and `input_packed[(k + 11) / 2]`; with two complex values per vector that pairs
// element `k` with element `k + 11`.
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[5]),
extract_hi_lo_f32(input_packed[0], input_packed[6]),
extract_lo_hi_f32(input_packed[1], input_packed[6]),
extract_hi_lo_f32(input_packed[1], input_packed[7]),
extract_lo_hi_f32(input_packed[2], input_packed[7]),
extract_hi_lo_f32(input_packed[2], input_packed[8]),
extract_lo_hi_f32(input_packed[3], input_packed[8]),
extract_hi_lo_f32(input_packed[3], input_packed[9]),
extract_lo_hi_f32(input_packed[4], input_packed[9]),
extract_hi_lo_f32(input_packed[4], input_packed[10]),
extract_lo_hi_f32(input_packed[5], input_packed[10]),
];
let out = self.perform_parallel_fft_direct(values);
// Inverse shuffle: the first five vectors take low halves of consecutive outputs,
// the middle one joins `out[10]` (low) with `out[0]` (high), the rest take high halves.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_hi_f32(out[10], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
];
// The literal `2` is presumably the index stride used when storing — confirm in `sse_utils`.
write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10 });
}
/// Core length-11 butterfly on eleven `__m128` vectors.
///
/// `values[k]` is input `k`; the result is returned in output order `y00..y10`.
/// The comments below assume `column_butterfly2([a, b])` yields `[a + b, a - b]`,
/// `fmadd(acc, a, b)` yields `acc + a * b` and `nmadd(acc, a, b)` yields `acc - a * b`
/// — TODO confirm against `sse_vector`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support at runtime — confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 11]) -> [__m128; 11] {
// Always built for `Inverse`; `self.direction` only enters through the twiddles
// that `new` computed.
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Fold input `k` with input `11 - k`: `xKpJ` is the first butterfly output, `xKmJ` the
// second one after the 90-degree rotation. `y00` sums `values[0]` and every `xKpJ`.
let y00 = values[0];
let [x1p10, x1m10] = SseVector::column_butterfly2([values[1], values[10]]);
let x1m10 = SseVector::apply_rotate90(rotate, x1m10);
let y00 = SseVector::add(y00, x1p10);
let [x2p9, x2m9] = SseVector::column_butterfly2([values[2], values[9]]);
let x2m9 = SseVector::apply_rotate90(rotate, x2m9);
let y00 = SseVector::add(y00, x2p9);
let [x3p8, x3m8] = SseVector::column_butterfly2([values[3], values[8]]);
let x3m8 = SseVector::apply_rotate90(rotate, x3m8);
let y00 = SseVector::add(y00, x3p8);
let [x4p7, x4m7] = SseVector::column_butterfly2([values[4], values[7]]);
let x4m7 = SseVector::apply_rotate90(rotate, x4m7);
let y00 = SseVector::add(y00, x4p7);
let [x5p6, x5m6] = SseVector::column_butterfly2([values[5], values[6]]);
let x5m6 = SseVector::apply_rotate90(rotate, x5m6);
let y00 = SseVector::add(y00, x5p6);
// Outputs 1 and 10: `a` combines `values[0]` and the `p` terms with real twiddle parts,
// `b` combines the rotated `m` terms with imaginary twiddle parts.
let m0110a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p10);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[1], x2p9);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[2], x3p8);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[3], x4p7);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[4], x5p6);
let m0110b = SseVector::mul(self.twiddles_im[0], x1m10);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[1], x2m9);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[2], x3m8);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[3], x4m7);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[4], x5m6);
let [y01, y10] = SseVector::column_butterfly2([m0110a, m0110b]);
// Outputs 2 and 9: same scheme with permuted twiddle indices; `nmadd` marks the
// terms whose imaginary twiddle part enters with a negative sign.
let m0209a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p10);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[3], x2p9);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[4], x3p8);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[2], x4p7);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[0], x5p6);
let m0209b = SseVector::mul(self.twiddles_im[1], x1m10);
let m0209b = SseVector::fmadd(m0209b, self.twiddles_im[3], x2m9);
let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[4], x3m8);
let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[2], x4m7);
let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[0], x5m6);
let [y02, y09] = SseVector::column_butterfly2([m0209a, m0209b]);
// Outputs 3 and 8.
let m0308a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p10);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[4], x2p9);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[1], x3p8);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[0], x4p7);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[3], x5p6);
let m0308b = SseVector::mul(self.twiddles_im[2], x1m10);
let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[4], x2m9);
let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[1], x3m8);
let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[0], x4m7);
let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[3], x5m6);
let [y03, y08] = SseVector::column_butterfly2([m0308a, m0308b]);
// Outputs 4 and 7.
let m0407a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p10);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[2], x2p9);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[0], x3p8);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[4], x4p7);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[1], x5p6);
let m0407b = SseVector::mul(self.twiddles_im[3], x1m10);
let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[2], x2m9);
let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[0], x3m8);
let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[4], x4m7);
let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[1], x5m6);
let [y04, y07] = SseVector::column_butterfly2([m0407a, m0407b]);
// Outputs 5 and 6.
let m0506a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p10);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[0], x2p9);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[3], x3p8);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[1], x4p7);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[2], x5p6);
let m0506b = SseVector::mul(self.twiddles_im[4], x1m10);
let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[0], x2m9);
let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[3], x3m8);
let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[1], x4m7);
let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[2], x5m6);
let [y05, y06] = SseVector::column_butterfly2([m0506a, m0506b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10]
}
}
/// SSE butterfly for length-11 FFTs that works on `__m128d` vectors.
///
/// Stores the direction it was built for plus the real and imaginary parts of the first
/// five twiddle factors of a length-11 transform, one `__m128d` per twiddle part.
struct SseF64Butterfly11<T> {
/// Direction given to `new`; also what the closure passed to the boilerplate macro returns.
direction: FftDirection,
/// Entry `k` is `broadcast_scalar` of the real part of twiddle `k + 1`.
twiddles_re: [__m128d; 5],
/// Entry `k` is `broadcast_scalar` of the imaginary part of twiddle `k + 1`.
twiddles_im: [__m128d; 5],
/// Zero-sized marker that ties the struct to the generic parameter `T`.
_phantom: std::marker::PhantomData<T>,
}
// The macro is defined elsewhere in the crate; it receives the type, the FFT length (11)
// and a closure returning the stored direction — presumably to generate the shared
// FFT trait plumbing (confirm in the macro definition).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly11, 11, |this: &SseF64Butterfly11<_>| this.direction);
impl<T: FftNum> SseF64Butterfly11<T> {
/// Builds the butterfly for `direction`, precomputing twiddles 1..=5 of a length-11 FFT.
///
/// # Safety
/// Compiled with the `sse4.1` target feature enabled, so the caller must make sure the
/// running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
// Presumably rejects any `T` other than `f64` — confirm in `sse_common`.
assert_f64::<T>();
// The array length (5) is inferred from the `twiddles_re` / `twiddles_im` field types.
let twiddles = make_twiddles(11, direction);
// NOTE(review): the explicit `unsafe` block wraps the `broadcast_scalar` calls, which are
// presumably unsafe SSE wrappers — confirm in `sse_vector`.
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs a length-11 FFT on `buffer`: reads elements 0..=10, transforms them with
/// `perform_fft_direct` and writes the results back to the same indices.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 11 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10 });
}
/// Core length-11 butterfly on eleven `__m128d` vectors.
///
/// `values[k]` is input `k`; the result is returned in output order `y00..y10`.
/// The comments below assume `column_butterfly2([a, b])` yields `[a + b, a - b]`,
/// `fmadd(acc, a, b)` yields `acc + a * b` and `nmadd(acc, a, b)` yields `acc - a * b`
/// — TODO confirm against `sse_vector`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support at runtime — confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 11]) -> [__m128d; 11] {
// Always built for `Inverse`; `self.direction` only enters through the twiddles
// that `new` computed.
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Fold input `k` with input `11 - k`: `xKpJ` is the first butterfly output, `xKmJ` the
// second one after the 90-degree rotation. `y00` sums `values[0]` and every `xKpJ`.
let y00 = values[0];
let [x1p10, x1m10] = SseVector::column_butterfly2([values[1], values[10]]);
let x1m10 = SseVector::apply_rotate90(rotate, x1m10);
let y00 = SseVector::add(y00, x1p10);
let [x2p9, x2m9] = SseVector::column_butterfly2([values[2], values[9]]);
let x2m9 = SseVector::apply_rotate90(rotate, x2m9);
let y00 = SseVector::add(y00, x2p9);
let [x3p8, x3m8] = SseVector::column_butterfly2([values[3], values[8]]);
let x3m8 = SseVector::apply_rotate90(rotate, x3m8);
let y00 = SseVector::add(y00, x3p8);
let [x4p7, x4m7] = SseVector::column_butterfly2([values[4], values[7]]);
let x4m7 = SseVector::apply_rotate90(rotate, x4m7);
let y00 = SseVector::add(y00, x4p7);
let [x5p6, x5m6] = SseVector::column_butterfly2([values[5], values[6]]);
let x5m6 = SseVector::apply_rotate90(rotate, x5m6);
let y00 = SseVector::add(y00, x5p6);
// Outputs 1 and 10: `a` combines `values[0]` and the `p` terms with real twiddle parts,
// `b` combines the rotated `m` terms with imaginary twiddle parts.
let m0110a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p10);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[1], x2p9);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[2], x3p8);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[3], x4p7);
let m0110a = SseVector::fmadd(m0110a, self.twiddles_re[4], x5p6);
let m0110b = SseVector::mul(self.twiddles_im[0], x1m10);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[1], x2m9);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[2], x3m8);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[3], x4m7);
let m0110b = SseVector::fmadd(m0110b, self.twiddles_im[4], x5m6);
let [y01, y10] = SseVector::column_butterfly2([m0110a, m0110b]);
// Outputs 2 and 9: same scheme with permuted twiddle indices; `nmadd` marks the
// terms whose imaginary twiddle part enters with a negative sign.
let m0209a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p10);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[3], x2p9);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[4], x3p8);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[2], x4p7);
let m0209a = SseVector::fmadd(m0209a, self.twiddles_re[0], x5p6);
let m0209b = SseVector::mul(self.twiddles_im[1], x1m10);
let m0209b = SseVector::fmadd(m0209b, self.twiddles_im[3], x2m9);
let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[4], x3m8);
let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[2], x4m7);
let m0209b = SseVector::nmadd(m0209b, self.twiddles_im[0], x5m6);
let [y02, y09] = SseVector::column_butterfly2([m0209a, m0209b]);
// Outputs 3 and 8.
let m0308a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p10);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[4], x2p9);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[1], x3p8);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[0], x4p7);
let m0308a = SseVector::fmadd(m0308a, self.twiddles_re[3], x5p6);
let m0308b = SseVector::mul(self.twiddles_im[2], x1m10);
let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[4], x2m9);
let m0308b = SseVector::nmadd(m0308b, self.twiddles_im[1], x3m8);
let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[0], x4m7);
let m0308b = SseVector::fmadd(m0308b, self.twiddles_im[3], x5m6);
let [y03, y08] = SseVector::column_butterfly2([m0308a, m0308b]);
// Outputs 4 and 7.
let m0407a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p10);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[2], x2p9);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[0], x3p8);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[4], x4p7);
let m0407a = SseVector::fmadd(m0407a, self.twiddles_re[1], x5p6);
let m0407b = SseVector::mul(self.twiddles_im[3], x1m10);
let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[2], x2m9);
let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[0], x3m8);
let m0407b = SseVector::fmadd(m0407b, self.twiddles_im[4], x4m7);
let m0407b = SseVector::nmadd(m0407b, self.twiddles_im[1], x5m6);
let [y04, y07] = SseVector::column_butterfly2([m0407a, m0407b]);
// Outputs 5 and 6.
let m0506a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p10);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[0], x2p9);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[3], x3p8);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[1], x4p7);
let m0506a = SseVector::fmadd(m0506a, self.twiddles_re[2], x5p6);
let m0506b = SseVector::mul(self.twiddles_im[4], x1m10);
let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[0], x2m9);
let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[3], x3m8);
let m0506b = SseVector::nmadd(m0506b, self.twiddles_im[1], x4m7);
let m0506b = SseVector::fmadd(m0506b, self.twiddles_im[2], x5m6);
let [y05, y06] = SseVector::column_butterfly2([m0506a, m0506b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10]
}
}
/// SSE butterfly for length-13 FFTs that works on `__m128` vectors.
///
/// Stores the direction it was built for plus the real and imaginary parts of the first
/// six twiddle factors of a length-13 transform, one `__m128` per twiddle part.
struct SseF32Butterfly13<T> {
/// Direction given to `new`; also what the closure passed to the boilerplate macro returns.
direction: FftDirection,
/// Entry `k` is `broadcast_scalar` of the real part of twiddle `k + 1`.
twiddles_re: [__m128; 6],
/// Entry `k` is `broadcast_scalar` of the imaginary part of twiddle `k + 1`.
twiddles_im: [__m128; 6],
/// Zero-sized marker that ties the struct to the generic parameter `T`.
_phantom: std::marker::PhantomData<T>,
}
// The macro is defined elsewhere in the crate; it receives the type, the FFT length (13)
// and a closure returning the stored direction — presumably to generate the shared
// FFT trait plumbing (confirm in the macro definition).
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly13, 13, |this: &SseF32Butterfly13<_>| this.direction);
impl<T: FftNum> SseF32Butterfly13<T> {
/// Builds the butterfly for `direction`, precomputing twiddles 1..=6 of a length-13 FFT.
///
/// # Safety
/// Compiled with the `sse4.1` target feature enabled, so the caller must make sure the
/// running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
// Presumably rejects any `T` other than `f32` — confirm in `sse_common`.
assert_f32::<T>();
// The array length (6) is inferred from the `twiddles_re` / `twiddles_im` field types.
let twiddles = make_twiddles(13, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs a single length-13 FFT on `buffer`: reads elements 0..=12, transforms them with
/// the shared kernel and writes the results back to the same indices.
///
/// NOTE(review): the `partial1` read and `partial_lo` write macros presumably load and
/// store only one complex value per vector — confirm in `sse_utils`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 13 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 } );
}
/// Runs the kernel once over thirteen vectors loaded from even indices 0..=24 of `buffer`.
///
/// NOTE(review): this looks like two back-to-back length-13 FFTs (elements 0..=12 and
/// 13..=25), assuming each `__m128` holds two consecutive complex values — TODO confirm.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 at runtime and a buffer holding at least
/// 26 complex values — verify against the macros and the callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24 });
// Regroup the packed vectors so that `values[k]` combines halves of `input_packed[k / 2]`
// and `input_packed[(k + 13) / 2]`; with two complex values per vector that pairs
// element `k` with element `k + 13`.
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[6]),
extract_hi_lo_f32(input_packed[0], input_packed[7]),
extract_lo_hi_f32(input_packed[1], input_packed[7]),
extract_hi_lo_f32(input_packed[1], input_packed[8]),
extract_lo_hi_f32(input_packed[2], input_packed[8]),
extract_hi_lo_f32(input_packed[2], input_packed[9]),
extract_lo_hi_f32(input_packed[3], input_packed[9]),
extract_hi_lo_f32(input_packed[3], input_packed[10]),
extract_lo_hi_f32(input_packed[4], input_packed[10]),
extract_hi_lo_f32(input_packed[4], input_packed[11]),
extract_lo_hi_f32(input_packed[5], input_packed[11]),
extract_hi_lo_f32(input_packed[5], input_packed[12]),
extract_lo_hi_f32(input_packed[6], input_packed[12]),
];
let out = self.perform_parallel_fft_direct(values);
// Inverse shuffle: the first six vectors take low halves of consecutive outputs,
// the middle one joins `out[12]` (low) with `out[0]` (high), the rest take high halves.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_lo_f32(out[10], out[11]),
extract_lo_hi_f32(out[12], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
extract_hi_hi_f32(out[11], out[12]),
];
// The literal `2` is presumably the index stride used when storing — confirm in `sse_utils`.
write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
}
/// Core length-13 butterfly on thirteen `__m128` vectors.
///
/// `values[k]` is input `k`; the result is returned in output order `y00..y12`.
/// The comments below assume `column_butterfly2([a, b])` yields `[a + b, a - b]`,
/// `fmadd(acc, a, b)` yields `acc + a * b` and `nmadd(acc, a, b)` yields `acc - a * b`
/// — TODO confirm against `sse_vector`.
///
/// # Safety
/// NOTE(review): presumably requires SSE4.1 support at runtime — confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 13]) -> [__m128; 13] {
// Always built for `Inverse`; `self.direction` only enters through the twiddles
// that `new` computed.
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Fold input `k` with input `13 - k`: `xKpJ` is the first butterfly output, `xKmJ` the
// second one after the 90-degree rotation. `y00` sums `values[0]` and every `xKpJ`.
let y00 = values[0];
let [x1p12, x1m12] = SseVector::column_butterfly2([values[1], values[12]]);
let x1m12 = SseVector::apply_rotate90(rotate, x1m12);
let y00 = SseVector::add(y00, x1p12);
let [x2p11, x2m11] = SseVector::column_butterfly2([values[2], values[11]]);
let x2m11 = SseVector::apply_rotate90(rotate, x2m11);
let y00 = SseVector::add(y00, x2p11);
let [x3p10, x3m10] = SseVector::column_butterfly2([values[3], values[10]]);
let x3m10 = SseVector::apply_rotate90(rotate, x3m10);
let y00 = SseVector::add(y00, x3p10);
let [x4p9, x4m9] = SseVector::column_butterfly2([values[4], values[9]]);
let x4m9 = SseVector::apply_rotate90(rotate, x4m9);
let y00 = SseVector::add(y00, x4p9);
let [x5p8, x5m8] = SseVector::column_butterfly2([values[5], values[8]]);
let x5m8 = SseVector::apply_rotate90(rotate, x5m8);
let y00 = SseVector::add(y00, x5p8);
let [x6p7, x6m7] = SseVector::column_butterfly2([values[6], values[7]]);
let x6m7 = SseVector::apply_rotate90(rotate, x6m7);
let y00 = SseVector::add(y00, x6p7);
// Outputs 1 and 12: `a` combines `values[0]` and the `p` terms with real twiddle parts,
// `b` combines the rotated `m` terms with imaginary twiddle parts.
let m0112a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p12);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[1], x2p11);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[2], x3p10);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[3], x4p9);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[4], x5p8);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[5], x6p7);
let m0112b = SseVector::mul(self.twiddles_im[0], x1m12);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[1], x2m11);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[2], x3m10);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[3], x4m9);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[4], x5m8);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[5], x6m7);
let [y01, y12] = SseVector::column_butterfly2([m0112a, m0112b]);
// Outputs 2 and 11: same scheme with permuted twiddle indices; `nmadd` marks the
// terms whose imaginary twiddle part enters with a negative sign.
let m0211a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p12);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[3], x2p11);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[5], x3p10);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[4], x4p9);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[2], x5p8);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[0], x6p7);
let m0211b = SseVector::mul(self.twiddles_im[1], x1m12);
let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[3], x2m11);
let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[5], x3m10);
let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[4], x4m9);
let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[2], x5m8);
let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[0], x6m7);
let [y02, y11] = SseVector::column_butterfly2([m0211a, m0211b]);
// Outputs 3 and 10.
let m0310a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p12);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[5], x2p11);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[3], x3p10);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[0], x4p9);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[1], x5p8);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[4], x6p7);
let m0310b = SseVector::mul(self.twiddles_im[2], x1m12);
let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[5], x2m11);
let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[3], x3m10);
let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[0], x4m9);
let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[1], x5m8);
let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[4], x6m7);
let [y03, y10] = SseVector::column_butterfly2([m0310a, m0310b]);
// Outputs 4 and 9.
let m0409a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p12);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[4], x2p11);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[0], x3p10);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[2], x4p9);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[5], x5p8);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[1], x6p7);
let m0409b = SseVector::mul(self.twiddles_im[3], x1m12);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[4], x2m11);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[0], x3m10);
let m0409b = SseVector::fmadd(m0409b, self.twiddles_im[2], x4m9);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[5], x5m8);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[1], x6m7);
let [y04, y09] = SseVector::column_butterfly2([m0409a, m0409b]);
// Outputs 5 and 8.
let m0508a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p12);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[2], x2p11);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[1], x3p10);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[5], x4p9);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[0], x5p8);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[3], x6p7);
let m0508b = SseVector::mul(self.twiddles_im[4], x1m12);
let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[2], x2m11);
let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[1], x3m10);
let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[5], x4m9);
let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[0], x5m8);
let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[3], x6m7);
let [y05, y08] = SseVector::column_butterfly2([m0508a, m0508b]);
// Outputs 6 and 7.
let m0607a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p12);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[0], x2p11);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[4], x3p10);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[1], x4p9);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[3], x5p8);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[2], x6p7);
let m0607b = SseVector::mul(self.twiddles_im[5], x1m12);
let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[0], x2m11);
let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[4], x3m10);
let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[1], x4m9);
let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[3], x5m8);
let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[2], x6m7);
let [y06, y07] = SseVector::column_butterfly2([m0607a, m0607b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12]
}
}
/// SSE butterfly for length-13 FFTs that works on `__m128d` vectors.
///
/// Stores the direction it was built for plus the real and imaginary parts of the first
/// six twiddle factors of a length-13 transform, one `__m128d` per twiddle part.
struct SseF64Butterfly13<T> {
/// Direction given to `new`; also what the closure passed to the boilerplate macro returns.
direction: FftDirection,
/// Entry `k` is `broadcast_scalar` of the real part of twiddle `k + 1`.
twiddles_re: [__m128d; 6],
/// Entry `k` is `broadcast_scalar` of the imaginary part of twiddle `k + 1`.
twiddles_im: [__m128d; 6],
/// Zero-sized marker that ties the struct to the generic parameter `T`.
_phantom: std::marker::PhantomData<T>,
}
// The macro is defined elsewhere in the crate; it receives the type, the FFT length (13)
// and a closure returning the stored direction — presumably to generate the shared
// FFT trait plumbing (confirm in the macro definition).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly13, 13, |this: &SseF64Butterfly13<_>| this.direction);
impl<T: FftNum> SseF64Butterfly13<T> {
/// Builds a length-13 butterfly for `direction`.
///
/// Calls `assert_f64::<T>()` first, then stores the real and imaginary parts of the six
/// values returned by `make_twiddles(13, direction)`, each passed through
/// `SseVector::broadcast_scalar`.
/// NOTE(review): `assert_f64` presumably rejects any `T` other than `f64` -- confirm in `sse_common`.
///
/// # Safety
///
/// This function is compiled with `#[target_feature(enable = "sse4.1")]`; the caller must
/// make sure the running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let twiddles = make_twiddles(13, direction);
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs one length-13 FFT in place on `buffer`.
///
/// Loads the values at indices 0..=12, passes them to `perform_fft_direct`, and writes the
/// 13 results back to the same indices.
///
/// # Safety
///
/// NOTE(review): the contract is not spelled out in this file section; presumably `buffer`
/// must hold at least 13 complex values and SSE4.1 must be available -- confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12 });
}
/// Computes the 13 outputs `[y00, ..., y12]` from 13 input vectors held in registers.
///
/// `self.direction` is not read here: the rotation is always built from
/// `FftDirection::Inverse`, and the only direction-dependent data are the twiddles
/// prepared in `new`.
///
/// # Safety
///
/// NOTE(review): presumably requires SSE4.1 to be available, like `new` -- confirm.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 13]) -> [__m128d; 13] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Pair input j with input 13 - j. `column_butterfly2` returns the two combinations named
// xJpK / xJmK (presumably sum and difference -- confirm in `sse_vector`). The xJmK half is
// passed through `apply_rotate90`; the xJpK half is accumulated into y00 together with values[0].
let y00 = values[0];
let [x1p12, x1m12] = SseVector::column_butterfly2([values[1], values[12]]);
let x1m12 = SseVector::apply_rotate90(rotate, x1m12);
let y00 = SseVector::add(y00, x1p12);
let [x2p11, x2m11] = SseVector::column_butterfly2([values[2], values[11]]);
let x2m11 = SseVector::apply_rotate90(rotate, x2m11);
let y00 = SseVector::add(y00, x2p11);
let [x3p10, x3m10] = SseVector::column_butterfly2([values[3], values[10]]);
let x3m10 = SseVector::apply_rotate90(rotate, x3m10);
let y00 = SseVector::add(y00, x3p10);
let [x4p9, x4m9] = SseVector::column_butterfly2([values[4], values[9]]);
let x4m9 = SseVector::apply_rotate90(rotate, x4m9);
let y00 = SseVector::add(y00, x4p9);
let [x5p8, x5m8] = SseVector::column_butterfly2([values[5], values[8]]);
let x5m8 = SseVector::apply_rotate90(rotate, x5m8);
let y00 = SseVector::add(y00, x5p8);
let [x6p7, x6m7] = SseVector::column_butterfly2([values[6], values[7]]);
let x6m7 = SseVector::apply_rotate90(rotate, x6m7);
let y00 = SseVector::add(y00, x6p7);
// Each remaining output pair (k, 13 - k) is built from two chains:
//   `a`: starts at values[0] and folds in every xJpK weighted by a `twiddles_re` entry;
//   `b`: folds in every rotated xJmK weighted by the matching `twiddles_im` entry.
// For output k and pair j, let m = (k * j) % 13, replaced by 13 - m when m > 6. The twiddle
// index used is m - 1, and the `b` chain uses `nmadd` instead of `fmadd` exactly when that
// replacement happened.
// NOTE(review): `fmadd(acc, t, x)` / `nmadd(acc, t, x)` presumably compute `acc + t * x` /
// `acc - t * x` -- confirm in `sse_vector`.
// Outputs 1 and 12.
let m0112a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p12);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[1], x2p11);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[2], x3p10);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[3], x4p9);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[4], x5p8);
let m0112a = SseVector::fmadd(m0112a, self.twiddles_re[5], x6p7);
let m0112b = SseVector::mul(self.twiddles_im[0], x1m12);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[1], x2m11);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[2], x3m10);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[3], x4m9);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[4], x5m8);
let m0112b = SseVector::fmadd(m0112b, self.twiddles_im[5], x6m7);
let [y01, y12] = SseVector::column_butterfly2([m0112a, m0112b]);
// Outputs 2 and 11.
let m0211a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p12);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[3], x2p11);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[5], x3p10);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[4], x4p9);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[2], x5p8);
let m0211a = SseVector::fmadd(m0211a, self.twiddles_re[0], x6p7);
let m0211b = SseVector::mul(self.twiddles_im[1], x1m12);
let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[3], x2m11);
let m0211b = SseVector::fmadd(m0211b, self.twiddles_im[5], x3m10);
let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[4], x4m9);
let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[2], x5m8);
let m0211b = SseVector::nmadd(m0211b, self.twiddles_im[0], x6m7);
let [y02, y11] = SseVector::column_butterfly2([m0211a, m0211b]);
// Outputs 3 and 10.
let m0310a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p12);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[5], x2p11);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[3], x3p10);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[0], x4p9);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[1], x5p8);
let m0310a = SseVector::fmadd(m0310a, self.twiddles_re[4], x6p7);
let m0310b = SseVector::mul(self.twiddles_im[2], x1m12);
let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[5], x2m11);
let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[3], x3m10);
let m0310b = SseVector::nmadd(m0310b, self.twiddles_im[0], x4m9);
let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[1], x5m8);
let m0310b = SseVector::fmadd(m0310b, self.twiddles_im[4], x6m7);
let [y03, y10] = SseVector::column_butterfly2([m0310a, m0310b]);
// Outputs 4 and 9.
let m0409a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p12);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[4], x2p11);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[0], x3p10);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[2], x4p9);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[5], x5p8);
let m0409a = SseVector::fmadd(m0409a, self.twiddles_re[1], x6p7);
let m0409b = SseVector::mul(self.twiddles_im[3], x1m12);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[4], x2m11);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[0], x3m10);
let m0409b = SseVector::fmadd(m0409b, self.twiddles_im[2], x4m9);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[5], x5m8);
let m0409b = SseVector::nmadd(m0409b, self.twiddles_im[1], x6m7);
let [y04, y09] = SseVector::column_butterfly2([m0409a, m0409b]);
// Outputs 5 and 8.
let m0508a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p12);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[2], x2p11);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[1], x3p10);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[5], x4p9);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[0], x5p8);
let m0508a = SseVector::fmadd(m0508a, self.twiddles_re[3], x6p7);
let m0508b = SseVector::mul(self.twiddles_im[4], x1m12);
let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[2], x2m11);
let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[1], x3m10);
let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[5], x4m9);
let m0508b = SseVector::nmadd(m0508b, self.twiddles_im[0], x5m8);
let m0508b = SseVector::fmadd(m0508b, self.twiddles_im[3], x6m7);
let [y05, y08] = SseVector::column_butterfly2([m0508a, m0508b]);
// Outputs 6 and 7.
let m0607a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p12);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[0], x2p11);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[4], x3p10);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[1], x4p9);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[3], x5p8);
let m0607a = SseVector::fmadd(m0607a, self.twiddles_re[2], x6p7);
let m0607b = SseVector::mul(self.twiddles_im[5], x1m12);
let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[0], x2m11);
let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[4], x3m10);
let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[1], x4m9);
let m0607b = SseVector::fmadd(m0607b, self.twiddles_im[3], x5m8);
let m0607b = SseVector::nmadd(m0607b, self.twiddles_im[2], x6m7);
let [y06, y07] = SseVector::column_butterfly2([m0607a, m0607b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12]
}
}
/// Length-17 FFT butterfly working on `f32` data held in `__m128` vectors.
///
/// `T` is carried only as a marker; no value of type `T` is stored.
struct SseF32Butterfly17<T> {
    /// Direction this instance was constructed with.
    direction: FftDirection,
    /// Real parts of the eight values from `make_twiddles(17, direction)`, each passed
    /// through `SseVector::broadcast_scalar`.
    twiddles_re: [__m128; 8],
    /// Imaginary parts of the same eight values, prepared the same way.
    twiddles_im: [__m128; 8],
    /// Zero-sized marker tying the struct to its scalar type parameter.
    _phantom: core::marker::PhantomData<T>,
}
// The macro (defined elsewhere) receives the type, its length and a closure that
// returns the stored direction.
boilerplate_fft_sse_f32_butterfly!(
    SseF32Butterfly17,
    17,
    |butterfly: &SseF32Butterfly17<_>| butterfly.direction
);
impl<T: FftNum> SseF32Butterfly17<T> {
/// Builds a length-17 butterfly for `direction`.
///
/// Calls `assert_f32::<T>()` first, then stores the real and imaginary parts of the eight
/// values returned by `make_twiddles(17, direction)`, each passed through
/// `SseVector::broadcast_scalar`.
/// NOTE(review): `assert_f32` presumably rejects any `T` other than `f32` -- confirm in `sse_common`.
///
/// # Safety
///
/// This function is compiled with `#[target_feature(enable = "sse4.1")]`; the caller must
/// make sure the running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let twiddles = make_twiddles(17, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs a single length-17 FFT in place on `buffer`.
///
/// Loads indices 0..=16 with `read_partial1_complex_to_array!`, runs them through the same
/// kernel used by the two-at-a-time path, and stores the results at the same indices with
/// `write_partial_lo_complex_to_array!`.
/// NOTE(review): presumably only the low half of each vector carries data on this path --
/// confirm against the macro definitions.
///
/// # Safety
///
/// NOTE(review): the contract is not spelled out in this file section; presumably `buffer`
/// must hold at least 17 complex values and SSE4.1 must be available -- confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let values = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
let out = self.perform_parallel_fft_direct(values);
write_partial_lo_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 } );
}
/// Runs the kernel once over 17 vectors loaded from buffer indices 0, 2, ..., 32.
///
/// NOTE(review): presumably each loaded `__m128` holds two consecutive complex `f32` values,
/// so the 17 loads span two back-to-back length-17 inputs that are transformed together --
/// confirm against `read_complex_to_array!` and the `extract_*` helpers.
///
/// # Safety
///
/// NOTE(review): presumably `buffer` must hold at least 34 complex values and SSE4.1 must be
/// available -- confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
let input_packed = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32 });
// Regroup the loaded vectors: entry 2i combines `input_packed[i]` with `input_packed[i + 8]`
// via `extract_lo_hi_f32`, entry 2i + 1 combines `input_packed[i]` with `input_packed[i + 9]`
// via `extract_hi_lo_f32`. Under the layout assumed above this puts element i of the first
// input next to element i of the second input in `values[i]`.
let values = [
extract_lo_hi_f32(input_packed[0], input_packed[8]),
extract_hi_lo_f32(input_packed[0], input_packed[9]),
extract_lo_hi_f32(input_packed[1], input_packed[9]),
extract_hi_lo_f32(input_packed[1], input_packed[10]),
extract_lo_hi_f32(input_packed[2], input_packed[10]),
extract_hi_lo_f32(input_packed[2], input_packed[11]),
extract_lo_hi_f32(input_packed[3], input_packed[11]),
extract_hi_lo_f32(input_packed[3], input_packed[12]),
extract_lo_hi_f32(input_packed[4], input_packed[12]),
extract_hi_lo_f32(input_packed[4], input_packed[13]),
extract_lo_hi_f32(input_packed[5], input_packed[13]),
extract_hi_lo_f32(input_packed[5], input_packed[14]),
extract_lo_hi_f32(input_packed[6], input_packed[14]),
extract_hi_lo_f32(input_packed[6], input_packed[15]),
extract_lo_hi_f32(input_packed[7], input_packed[15]),
extract_hi_lo_f32(input_packed[7], input_packed[16]),
extract_lo_hi_f32(input_packed[8], input_packed[16]),
];
let out = self.perform_parallel_fft_direct(values);
// Regroup the 17 result vectors for storing: the first eight entries take the low halves of
// consecutive result pairs, the ninth mixes the low half of `out[16]` with the high half of
// `out[0]`, and the last eight take the high halves of `out[1]..=out[16]` pairwise.
let out_packed = [
extract_lo_lo_f32(out[0], out[1]),
extract_lo_lo_f32(out[2], out[3]),
extract_lo_lo_f32(out[4], out[5]),
extract_lo_lo_f32(out[6], out[7]),
extract_lo_lo_f32(out[8], out[9]),
extract_lo_lo_f32(out[10], out[11]),
extract_lo_lo_f32(out[12], out[13]),
extract_lo_lo_f32(out[14], out[15]),
extract_lo_hi_f32(out[16], out[0]),
extract_hi_hi_f32(out[1], out[2]),
extract_hi_hi_f32(out[3], out[4]),
extract_hi_hi_f32(out[5], out[6]),
extract_hi_hi_f32(out[7], out[8]),
extract_hi_hi_f32(out[9], out[10]),
extract_hi_hi_f32(out[11], out[12]),
extract_hi_hi_f32(out[13], out[14]),
extract_hi_hi_f32(out[15], out[16]),
];
// NOTE(review): the stride argument 2 presumably maps indices 0..=16 back onto buffer
// positions 0, 2, ..., 32, i.e. the positions that were loaded -- confirm in the macro.
write_complex_to_array_strided!(out_packed, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
}
/// Computes the 17 outputs `[y00, ..., y16]` from 17 input vectors held in registers.
///
/// `self.direction` is not read here: the rotation is always built from
/// `FftDirection::Inverse`, and the only direction-dependent data are the twiddles
/// prepared in `new`.
///
/// # Safety
///
/// NOTE(review): presumably requires SSE4.1 to be available, like `new` -- confirm.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 17]) -> [__m128; 17] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Pair input j with input 17 - j. `column_butterfly2` returns the two combinations named
// xJpK / xJmK (presumably sum and difference -- confirm in `sse_vector`). The xJmK half is
// passed through `apply_rotate90`; the xJpK half is accumulated into y00 together with values[0].
let y00 = values[0];
let [x1p16, x1m16] = SseVector::column_butterfly2([values[1], values[16]]);
let x1m16 = SseVector::apply_rotate90(rotate, x1m16);
let y00 = SseVector::add(y00, x1p16);
let [x2p15, x2m15] = SseVector::column_butterfly2([values[2], values[15]]);
let x2m15 = SseVector::apply_rotate90(rotate, x2m15);
let y00 = SseVector::add(y00, x2p15);
let [x3p14, x3m14] = SseVector::column_butterfly2([values[3], values[14]]);
let x3m14 = SseVector::apply_rotate90(rotate, x3m14);
let y00 = SseVector::add(y00, x3p14);
let [x4p13, x4m13] = SseVector::column_butterfly2([values[4], values[13]]);
let x4m13 = SseVector::apply_rotate90(rotate, x4m13);
let y00 = SseVector::add(y00, x4p13);
let [x5p12, x5m12] = SseVector::column_butterfly2([values[5], values[12]]);
let x5m12 = SseVector::apply_rotate90(rotate, x5m12);
let y00 = SseVector::add(y00, x5p12);
let [x6p11, x6m11] = SseVector::column_butterfly2([values[6], values[11]]);
let x6m11 = SseVector::apply_rotate90(rotate, x6m11);
let y00 = SseVector::add(y00, x6p11);
let [x7p10, x7m10] = SseVector::column_butterfly2([values[7], values[10]]);
let x7m10 = SseVector::apply_rotate90(rotate, x7m10);
let y00 = SseVector::add(y00, x7p10);
let [x8p9, x8m9] = SseVector::column_butterfly2([values[8], values[9]]);
let x8m9 = SseVector::apply_rotate90(rotate, x8m9);
let y00 = SseVector::add(y00, x8p9);
// Each remaining output pair (k, 17 - k) is built from two chains:
//   `a`: starts at values[0] and folds in every xJpK weighted by a `twiddles_re` entry;
//   `b`: folds in every rotated xJmK weighted by the matching `twiddles_im` entry.
// For output k and pair j, let m = (k * j) % 17, replaced by 17 - m when m > 8. The twiddle
// index used is m - 1, and the `b` chain uses `nmadd` instead of `fmadd` exactly when that
// replacement happened.
// NOTE(review): `fmadd(acc, t, x)` / `nmadd(acc, t, x)` presumably compute `acc + t * x` /
// `acc - t * x` -- confirm in `sse_vector`.
// Outputs 1 and 16.
let m0116a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p16);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[1], x2p15);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[2], x3p14);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[3], x4p13);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[4], x5p12);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[5], x6p11);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[6], x7p10);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[7], x8p9);
let m0116b = SseVector::mul(self.twiddles_im[0], x1m16);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[1], x2m15);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[2], x3m14);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[3], x4m13);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[4], x5m12);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[5], x6m11);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[6], x7m10);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[7], x8m9);
let [y01, y16] = SseVector::column_butterfly2([m0116a, m0116b]);
// Outputs 2 and 15.
let m0215a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p16);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[3], x2p15);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[5], x3p14);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[7], x4p13);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[6], x5p12);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[4], x6p11);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[2], x7p10);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[0], x8p9);
let m0215b = SseVector::mul(self.twiddles_im[1], x1m16);
let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[3], x2m15);
let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[5], x3m14);
let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[7], x4m13);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[6], x5m12);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[4], x6m11);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[2], x7m10);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[0], x8m9);
let [y02, y15] = SseVector::column_butterfly2([m0215a, m0215b]);
// Outputs 3 and 14.
let m0314a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p16);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[5], x2p15);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[7], x3p14);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[4], x4p13);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[1], x5p12);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[0], x6p11);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[3], x7p10);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[6], x8p9);
let m0314b = SseVector::mul(self.twiddles_im[2], x1m16);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[5], x2m15);
let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[7], x3m14);
let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[4], x4m13);
let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[1], x5m12);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[0], x6m11);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[3], x7m10);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[6], x8m9);
let [y03, y14] = SseVector::column_butterfly2([m0314a, m0314b]);
// Outputs 4 and 13.
let m0413a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p16);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[7], x2p15);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[4], x3p14);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[0], x4p13);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[2], x5p12);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[6], x6p11);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[5], x7p10);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[1], x8p9);
let m0413b = SseVector::mul(self.twiddles_im[3], x1m16);
let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[7], x2m15);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[4], x3m14);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[0], x4m13);
let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[2], x5m12);
let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[6], x6m11);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[5], x7m10);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[1], x8m9);
let [y04, y13] = SseVector::column_butterfly2([m0413a, m0413b]);
// Outputs 5 and 12.
let m0512a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p16);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[6], x2p15);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[1], x3p14);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[2], x4p13);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[7], x5p12);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[3], x6p11);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[0], x7p10);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[5], x8p9);
let m0512b = SseVector::mul(self.twiddles_im[4], x1m16);
let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[6], x2m15);
let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[1], x3m14);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[2], x4m13);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[7], x5m12);
let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[3], x6m11);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[0], x7m10);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[5], x8m9);
let [y05, y12] = SseVector::column_butterfly2([m0512a, m0512b]);
// Outputs 6 and 11.
let m0611a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p16);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[4], x2p15);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[0], x3p14);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[6], x4p13);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[3], x5p12);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[1], x6p11);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[7], x7p10);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[2], x8p9);
let m0611b = SseVector::mul(self.twiddles_im[5], x1m16);
let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[4], x2m15);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[0], x3m14);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[6], x4m13);
let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[3], x5m12);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[1], x6m11);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[7], x7m10);
let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[2], x8m9);
let [y06, y11] = SseVector::column_butterfly2([m0611a, m0611b]);
// Outputs 7 and 10.
let m0710a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p16);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[2], x2p15);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[3], x3p14);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[5], x4p13);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[0], x5p12);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[7], x6p11);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[1], x7p10);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[4], x8p9);
let m0710b = SseVector::mul(self.twiddles_im[6], x1m16);
let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[2], x2m15);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[3], x3m14);
let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[5], x4m13);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[0], x5m12);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[7], x6m11);
let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[1], x7m10);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[4], x8m9);
let [y07, y10] = SseVector::column_butterfly2([m0710a, m0710b]);
// Outputs 8 and 9.
let m0809a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p16);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[0], x2p15);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[6], x3p14);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[1], x4p13);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[5], x5p12);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[2], x6p11);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[4], x7p10);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[3], x8p9);
let m0809b = SseVector::mul(self.twiddles_im[7], x1m16);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[0], x2m15);
let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[6], x3m14);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[1], x4m13);
let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[5], x5m12);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[2], x6m11);
let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[4], x7m10);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[3], x8m9);
let [y08, y09] = SseVector::column_butterfly2([m0809a, m0809b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16]
}
}
/// Length-17 FFT butterfly working on `f64` data held in `__m128d` vectors.
///
/// `T` is carried only as a marker; no value of type `T` is stored.
struct SseF64Butterfly17<T> {
    /// Direction this instance was constructed with.
    direction: FftDirection,
    /// Real parts of the eight values from `make_twiddles(17, direction)`, each passed
    /// through `SseVector::broadcast_scalar`.
    twiddles_re: [__m128d; 8],
    /// Imaginary parts of the same eight values, prepared the same way.
    twiddles_im: [__m128d; 8],
    /// Zero-sized marker tying the struct to its scalar type parameter.
    _phantom: core::marker::PhantomData<T>,
}
// The macro (defined elsewhere) receives the type, its length and a closure that
// returns the stored direction.
boilerplate_fft_sse_f64_butterfly!(
    SseF64Butterfly17,
    17,
    |butterfly: &SseF64Butterfly17<_>| butterfly.direction
);
impl<T: FftNum> SseF64Butterfly17<T> {
/// Builds a length-17 butterfly for `direction`.
///
/// Calls `assert_f64::<T>()` first, then stores the real and imaginary parts of the eight
/// values returned by `make_twiddles(17, direction)`, each passed through
/// `SseVector::broadcast_scalar`.
/// NOTE(review): `assert_f64` presumably rejects any `T` other than `f64` -- confirm in `sse_common`.
///
/// # Safety
///
/// This function is compiled with `#[target_feature(enable = "sse4.1")]`; the caller must
/// make sure the running CPU supports SSE4.1.
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let twiddles = make_twiddles(17, direction);
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs one length-17 FFT in place on `buffer`.
///
/// Loads the values at indices 0..=16, passes them to `perform_fft_direct`, and writes the
/// 17 results back to the same indices.
///
/// # Safety
///
/// NOTE(review): the contract is not spelled out in this file section; presumably `buffer`
/// must hold at least 17 complex values and SSE4.1 must be available -- confirm with callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
let out = self.perform_fft_direct(values);
write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16 });
}
/// Computes the 17 outputs `[y00, ..., y16]` from 17 input vectors held in registers.
///
/// `self.direction` is not read here: the rotation is always built from
/// `FftDirection::Inverse`, and the only direction-dependent data are the twiddles
/// prepared in `new`.
///
/// # Safety
///
/// NOTE(review): presumably requires SSE4.1 to be available, like `new` -- confirm.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 17]) -> [__m128d; 17] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Pair input j with input 17 - j. `column_butterfly2` returns the two combinations named
// xJpK / xJmK (presumably sum and difference -- confirm in `sse_vector`). The xJmK half is
// passed through `apply_rotate90`; the xJpK half is accumulated into y00 together with values[0].
let y00 = values[0];
let [x1p16, x1m16] = SseVector::column_butterfly2([values[1], values[16]]);
let x1m16 = SseVector::apply_rotate90(rotate, x1m16);
let y00 = SseVector::add(y00, x1p16);
let [x2p15, x2m15] = SseVector::column_butterfly2([values[2], values[15]]);
let x2m15 = SseVector::apply_rotate90(rotate, x2m15);
let y00 = SseVector::add(y00, x2p15);
let [x3p14, x3m14] = SseVector::column_butterfly2([values[3], values[14]]);
let x3m14 = SseVector::apply_rotate90(rotate, x3m14);
let y00 = SseVector::add(y00, x3p14);
let [x4p13, x4m13] = SseVector::column_butterfly2([values[4], values[13]]);
let x4m13 = SseVector::apply_rotate90(rotate, x4m13);
let y00 = SseVector::add(y00, x4p13);
let [x5p12, x5m12] = SseVector::column_butterfly2([values[5], values[12]]);
let x5m12 = SseVector::apply_rotate90(rotate, x5m12);
let y00 = SseVector::add(y00, x5p12);
let [x6p11, x6m11] = SseVector::column_butterfly2([values[6], values[11]]);
let x6m11 = SseVector::apply_rotate90(rotate, x6m11);
let y00 = SseVector::add(y00, x6p11);
let [x7p10, x7m10] = SseVector::column_butterfly2([values[7], values[10]]);
let x7m10 = SseVector::apply_rotate90(rotate, x7m10);
let y00 = SseVector::add(y00, x7p10);
let [x8p9, x8m9] = SseVector::column_butterfly2([values[8], values[9]]);
let x8m9 = SseVector::apply_rotate90(rotate, x8m9);
let y00 = SseVector::add(y00, x8p9);
// Each remaining output pair (k, 17 - k) is built from two chains:
//   `a`: starts at values[0] and folds in every xJpK weighted by a `twiddles_re` entry;
//   `b`: folds in every rotated xJmK weighted by the matching `twiddles_im` entry.
// For output k and pair j, let m = (k * j) % 17, replaced by 17 - m when m > 8. The twiddle
// index used is m - 1, and the `b` chain uses `nmadd` instead of `fmadd` exactly when that
// replacement happened.
// NOTE(review): `fmadd(acc, t, x)` / `nmadd(acc, t, x)` presumably compute `acc + t * x` /
// `acc - t * x` -- confirm in `sse_vector`.
// Outputs 1 and 16.
let m0116a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p16);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[1], x2p15);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[2], x3p14);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[3], x4p13);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[4], x5p12);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[5], x6p11);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[6], x7p10);
let m0116a = SseVector::fmadd(m0116a, self.twiddles_re[7], x8p9);
let m0116b = SseVector::mul(self.twiddles_im[0], x1m16);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[1], x2m15);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[2], x3m14);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[3], x4m13);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[4], x5m12);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[5], x6m11);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[6], x7m10);
let m0116b = SseVector::fmadd(m0116b, self.twiddles_im[7], x8m9);
let [y01, y16] = SseVector::column_butterfly2([m0116a, m0116b]);
// Outputs 2 and 15.
let m0215a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p16);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[3], x2p15);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[5], x3p14);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[7], x4p13);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[6], x5p12);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[4], x6p11);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[2], x7p10);
let m0215a = SseVector::fmadd(m0215a, self.twiddles_re[0], x8p9);
let m0215b = SseVector::mul(self.twiddles_im[1], x1m16);
let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[3], x2m15);
let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[5], x3m14);
let m0215b = SseVector::fmadd(m0215b, self.twiddles_im[7], x4m13);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[6], x5m12);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[4], x6m11);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[2], x7m10);
let m0215b = SseVector::nmadd(m0215b, self.twiddles_im[0], x8m9);
let [y02, y15] = SseVector::column_butterfly2([m0215a, m0215b]);
// Outputs 3 and 14.
let m0314a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p16);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[5], x2p15);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[7], x3p14);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[4], x4p13);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[1], x5p12);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[0], x6p11);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[3], x7p10);
let m0314a = SseVector::fmadd(m0314a, self.twiddles_re[6], x8p9);
let m0314b = SseVector::mul(self.twiddles_im[2], x1m16);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[5], x2m15);
let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[7], x3m14);
let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[4], x4m13);
let m0314b = SseVector::nmadd(m0314b, self.twiddles_im[1], x5m12);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[0], x6m11);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[3], x7m10);
let m0314b = SseVector::fmadd(m0314b, self.twiddles_im[6], x8m9);
let [y03, y14] = SseVector::column_butterfly2([m0314a, m0314b]);
// Outputs 4 and 13.
let m0413a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p16);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[7], x2p15);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[4], x3p14);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[0], x4p13);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[2], x5p12);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[6], x6p11);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[5], x7p10);
let m0413a = SseVector::fmadd(m0413a, self.twiddles_re[1], x8p9);
let m0413b = SseVector::mul(self.twiddles_im[3], x1m16);
let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[7], x2m15);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[4], x3m14);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[0], x4m13);
let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[2], x5m12);
let m0413b = SseVector::fmadd(m0413b, self.twiddles_im[6], x6m11);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[5], x7m10);
let m0413b = SseVector::nmadd(m0413b, self.twiddles_im[1], x8m9);
let [y04, y13] = SseVector::column_butterfly2([m0413a, m0413b]);
// Outputs 5 and 12.
let m0512a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p16);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[6], x2p15);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[1], x3p14);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[2], x4p13);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[7], x5p12);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[3], x6p11);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[0], x7p10);
let m0512a = SseVector::fmadd(m0512a, self.twiddles_re[5], x8p9);
let m0512b = SseVector::mul(self.twiddles_im[4], x1m16);
let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[6], x2m15);
let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[1], x3m14);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[2], x4m13);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[7], x5m12);
let m0512b = SseVector::nmadd(m0512b, self.twiddles_im[3], x6m11);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[0], x7m10);
let m0512b = SseVector::fmadd(m0512b, self.twiddles_im[5], x8m9);
let [y05, y12] = SseVector::column_butterfly2([m0512a, m0512b]);
// Outputs 6 and 11.
let m0611a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p16);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[4], x2p15);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[0], x3p14);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[6], x4p13);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[3], x5p12);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[1], x6p11);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[7], x7p10);
let m0611a = SseVector::fmadd(m0611a, self.twiddles_re[2], x8p9);
let m0611b = SseVector::mul(self.twiddles_im[5], x1m16);
let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[4], x2m15);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[0], x3m14);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[6], x4m13);
let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[3], x5m12);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[1], x6m11);
let m0611b = SseVector::fmadd(m0611b, self.twiddles_im[7], x7m10);
let m0611b = SseVector::nmadd(m0611b, self.twiddles_im[2], x8m9);
let [y06, y11] = SseVector::column_butterfly2([m0611a, m0611b]);
// Outputs 7 and 10.
let m0710a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p16);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[2], x2p15);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[3], x3p14);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[5], x4p13);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[0], x5p12);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[7], x6p11);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[1], x7p10);
let m0710a = SseVector::fmadd(m0710a, self.twiddles_re[4], x8p9);
let m0710b = SseVector::mul(self.twiddles_im[6], x1m16);
let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[2], x2m15);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[3], x3m14);
let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[5], x4m13);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[0], x5m12);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[7], x6m11);
let m0710b = SseVector::nmadd(m0710b, self.twiddles_im[1], x7m10);
let m0710b = SseVector::fmadd(m0710b, self.twiddles_im[4], x8m9);
let [y07, y10] = SseVector::column_butterfly2([m0710a, m0710b]);
// Outputs 8 and 9.
let m0809a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p16);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[0], x2p15);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[6], x3p14);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[1], x4p13);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[5], x5p12);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[2], x6p11);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[4], x7p10);
let m0809a = SseVector::fmadd(m0809a, self.twiddles_re[3], x8p9);
let m0809b = SseVector::mul(self.twiddles_im[7], x1m16);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[0], x2m15);
let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[6], x3m14);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[1], x4m13);
let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[5], x5m12);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[2], x6m11);
let m0809b = SseVector::fmadd(m0809b, self.twiddles_im[4], x7m10);
let m0809b = SseVector::nmadd(m0809b, self.twiddles_im[3], x8m9);
let [y08, y09] = SseVector::column_butterfly2([m0809a, m0809b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16]
}
}
/// Length-19 FFT butterfly working on `f32` data held in `__m128` vectors.
///
/// `T` is carried only as a marker; no value of type `T` is stored.
struct SseF32Butterfly19<T> {
    /// Direction this instance was constructed with.
    direction: FftDirection,
    /// Real parts of the nine values from `make_twiddles(19, direction)`, each passed
    /// through `SseVector::broadcast_scalar`.
    twiddles_re: [__m128; 9],
    /// Imaginary parts of the same nine values, prepared the same way.
    twiddles_im: [__m128; 9],
    /// Zero-sized marker tying the struct to its scalar type parameter.
    _phantom: core::marker::PhantomData<T>,
}
// The macro (defined elsewhere) receives the type, its length and a closure that
// returns the stored direction.
boilerplate_fft_sse_f32_butterfly!(
    SseF32Butterfly19,
    19,
    |butterfly: &SseF32Butterfly19<_>| butterfly.direction
);
impl<T: FftNum> SseF32Butterfly19<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let twiddles = make_twiddles(19, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs one length-19 FFT in place on `buffer`.
///
/// Loads elements 0..=18 with `read_partial1_complex_to_array!`, pushes them through
/// `perform_parallel_fft_direct` and stores the result back to the same 19 positions with
/// `write_partial_lo_complex_to_array!`. Presumably each load fills only one complex slot
/// of its vector and the store writes only the low one -- confirm at the macro definitions.
///
/// # Safety
/// The contract is not stated in this part of the file; presumably the CPU must support
/// SSE4.1 and `buffer` must hold at least 19 elements -- confirm against the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let loaded = read_partial1_complex_to_array!(buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
    });
    let spectrum = self.perform_parallel_fft_direct(loaded);
    write_partial_lo_complex_to_array!(spectrum, buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
    });
}
/// Runs the length-19 butterfly on 19 vectors loaded from `buffer` and writes 19 vectors back.
///
/// The 19 loads use indices 0, 2, ..., 36. They are then regrouped so that entry `i` of the
/// kernel input combines one half of `packed_in[i / 2]` with one half of
/// `packed_in[(19 + i) / 2]`. After the kernel, the inverse regrouping is applied before the
/// strided store. Presumably every vector holds two complex `f32` values, so this processes
/// two back-to-back length-19 signals at once -- confirm against the `extract_*` helpers.
///
/// # Safety
/// The contract is not stated in this part of the file; presumably the CPU must support
/// SSE4.1 and `buffer` must hold at least 38 elements -- confirm against the callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let packed_in = read_complex_to_array!(buffer, {
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36
    });

    // Even entries take (lo, hi), odd entries take (hi, lo); the second operand runs
    // through packed_in[9..=18].
    let lanes = [
        extract_lo_hi_f32(packed_in[0], packed_in[9]),
        extract_hi_lo_f32(packed_in[0], packed_in[10]),
        extract_lo_hi_f32(packed_in[1], packed_in[10]),
        extract_hi_lo_f32(packed_in[1], packed_in[11]),
        extract_lo_hi_f32(packed_in[2], packed_in[11]),
        extract_hi_lo_f32(packed_in[2], packed_in[12]),
        extract_lo_hi_f32(packed_in[3], packed_in[12]),
        extract_hi_lo_f32(packed_in[3], packed_in[13]),
        extract_lo_hi_f32(packed_in[4], packed_in[13]),
        extract_hi_lo_f32(packed_in[4], packed_in[14]),
        extract_lo_hi_f32(packed_in[5], packed_in[14]),
        extract_hi_lo_f32(packed_in[5], packed_in[15]),
        extract_lo_hi_f32(packed_in[6], packed_in[15]),
        extract_hi_lo_f32(packed_in[6], packed_in[16]),
        extract_lo_hi_f32(packed_in[7], packed_in[16]),
        extract_hi_lo_f32(packed_in[7], packed_in[17]),
        extract_lo_hi_f32(packed_in[8], packed_in[17]),
        extract_hi_lo_f32(packed_in[8], packed_in[18]),
        extract_lo_hi_f32(packed_in[9], packed_in[18]),
    ];

    let spectrum = self.perform_parallel_fft_direct(lanes);

    // First the low halves in output order, then the single mixed entry (18 low, 0 high),
    // then the high halves.
    let packed_out = [
        extract_lo_lo_f32(spectrum[0], spectrum[1]),
        extract_lo_lo_f32(spectrum[2], spectrum[3]),
        extract_lo_lo_f32(spectrum[4], spectrum[5]),
        extract_lo_lo_f32(spectrum[6], spectrum[7]),
        extract_lo_lo_f32(spectrum[8], spectrum[9]),
        extract_lo_lo_f32(spectrum[10], spectrum[11]),
        extract_lo_lo_f32(spectrum[12], spectrum[13]),
        extract_lo_lo_f32(spectrum[14], spectrum[15]),
        extract_lo_lo_f32(spectrum[16], spectrum[17]),
        extract_lo_hi_f32(spectrum[18], spectrum[0]),
        extract_hi_hi_f32(spectrum[1], spectrum[2]),
        extract_hi_hi_f32(spectrum[3], spectrum[4]),
        extract_hi_hi_f32(spectrum[5], spectrum[6]),
        extract_hi_hi_f32(spectrum[7], spectrum[8]),
        extract_hi_hi_f32(spectrum[9], spectrum[10]),
        extract_hi_hi_f32(spectrum[11], spectrum[12]),
        extract_hi_hi_f32(spectrum[13], spectrum[14]),
        extract_hi_hi_f32(spectrum[15], spectrum[16]),
        extract_hi_hi_f32(spectrum[17], spectrum[18]),
    ];

    write_complex_to_array_strided!(packed_out, buffer, 2, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
    });
}
/// Length-19 butterfly kernel: takes 19 already-loaded vectors and returns the 19 output vectors.
///
/// Structure of the body:
/// 1. Inputs `n` and `19 - n` (n = 1..=9) are paired through `column_butterfly2`, giving
///    `x{n}p{19-n}` and `x{n}m{19-n}`. The `m` term is passed through `apply_rotate90`, and
///    the `p` term is added into `y00`, which starts as `values[0]`.
/// 2. For every output pair `(k, 19 - k)` an "a" accumulator starts from `values[0]` and
///    adds the `p` terms weighted by `twiddles_re`, while a "b" accumulator combines the
///    rotated `m` terms weighted by `twiddles_im`.
/// 3. `column_butterfly2([a, b])` produces outputs `k` and `19 - k`.
///
/// Twiddle selection: for output `k` and input pair `j`, let `r = (k * j) % 19`. The table
/// slot used is `min(r, 19 - r) - 1`, and the `twiddles_im` term is applied with `nmadd`
/// instead of `fmadd` when `r > 9`.
///
/// Presumably `column_butterfly2` returns `[sum, difference]` and `fmadd(acc, a, b)` /
/// `nmadd(acc, a, b)` compute `acc + a * b` / `acc - a * b` -- confirm in the `SseVector`
/// definition, which is outside this part of the file.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 19]) -> [__m128; 19] {
// The rotation is always built for `FftDirection::Inverse`; `self.direction` is not read in
// this method and only reaches it through the twiddle tables filled in `new`.
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Stage 1: pair inputs 1..=9 with 18..=10, rotate the `m` terms, accumulate y00.
let y00 = values[0];
let [x1p18, x1m18] = SseVector::column_butterfly2([values[1], values[18]]);
let x1m18 = SseVector::apply_rotate90(rotate, x1m18);
let y00 = SseVector::add(y00, x1p18);
let [x2p17, x2m17] = SseVector::column_butterfly2([values[2], values[17]]);
let x2m17 = SseVector::apply_rotate90(rotate, x2m17);
let y00 = SseVector::add(y00, x2p17);
let [x3p16, x3m16] = SseVector::column_butterfly2([values[3], values[16]]);
let x3m16 = SseVector::apply_rotate90(rotate, x3m16);
let y00 = SseVector::add(y00, x3p16);
let [x4p15, x4m15] = SseVector::column_butterfly2([values[4], values[15]]);
let x4m15 = SseVector::apply_rotate90(rotate, x4m15);
let y00 = SseVector::add(y00, x4p15);
let [x5p14, x5m14] = SseVector::column_butterfly2([values[5], values[14]]);
let x5m14 = SseVector::apply_rotate90(rotate, x5m14);
let y00 = SseVector::add(y00, x5p14);
let [x6p13, x6m13] = SseVector::column_butterfly2([values[6], values[13]]);
let x6m13 = SseVector::apply_rotate90(rotate, x6m13);
let y00 = SseVector::add(y00, x6p13);
let [x7p12, x7m12] = SseVector::column_butterfly2([values[7], values[12]]);
let x7m12 = SseVector::apply_rotate90(rotate, x7m12);
let y00 = SseVector::add(y00, x7p12);
let [x8p11, x8m11] = SseVector::column_butterfly2([values[8], values[11]]);
let x8m11 = SseVector::apply_rotate90(rotate, x8m11);
let y00 = SseVector::add(y00, x8p11);
let [x9p10, x9m10] = SseVector::column_butterfly2([values[9], values[10]]);
let x9m10 = SseVector::apply_rotate90(rotate, x9m10);
let y00 = SseVector::add(y00, x9p10);
// Outputs 1 and 18 (k = 1): slots run 0..=8 in order, every `b` term added.
let m0118a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p18);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[1], x2p17);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[2], x3p16);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[3], x4p15);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[4], x5p14);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[5], x6p13);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[6], x7p12);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[7], x8p11);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[8], x9p10);
let m0118b = SseVector::mul(self.twiddles_im[0], x1m18);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[1], x2m17);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[2], x3m16);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[3], x4m15);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[4], x5m14);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[5], x6m13);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[6], x7m12);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[7], x8m11);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[8], x9m10);
let [y01, y18] = SseVector::column_butterfly2([m0118a, m0118b]);
// Outputs 2 and 17 (k = 2).
let m0217a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p18);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[3], x2p17);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[5], x3p16);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[7], x4p15);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[8], x5p14);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[6], x6p13);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[4], x7p12);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[2], x8p11);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[0], x9p10);
let m0217b = SseVector::mul(self.twiddles_im[1], x1m18);
let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[3], x2m17);
let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[5], x3m16);
let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[7], x4m15);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[8], x5m14);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[6], x6m13);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[4], x7m12);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[2], x8m11);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[0], x9m10);
let [y02, y17] = SseVector::column_butterfly2([m0217a, m0217b]);
// Outputs 3 and 16 (k = 3).
let m0316a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p18);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[5], x2p17);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[8], x3p16);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[6], x4p15);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[3], x5p14);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[0], x6p13);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[1], x7p12);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[4], x8p11);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[7], x9p10);
let m0316b = SseVector::mul(self.twiddles_im[2], x1m18);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[5], x2m17);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[8], x3m16);
let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[6], x4m15);
let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[3], x5m14);
let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[0], x6m13);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[1], x7m12);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[4], x8m11);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[7], x9m10);
let [y03, y16] = SseVector::column_butterfly2([m0316a, m0316b]);
// Outputs 4 and 15 (k = 4).
let m0415a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p18);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[7], x2p17);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[6], x3p16);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[2], x4p15);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[0], x5p14);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[4], x6p13);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[8], x7p12);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[5], x8p11);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[1], x9p10);
let m0415b = SseVector::mul(self.twiddles_im[3], x1m18);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[7], x2m17);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[6], x3m16);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[2], x4m15);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[0], x5m14);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[4], x6m13);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[8], x7m12);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[5], x8m11);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[1], x9m10);
let [y04, y15] = SseVector::column_butterfly2([m0415a, m0415b]);
// Outputs 5 and 14 (k = 5).
let m0514a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p18);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[8], x2p17);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[3], x3p16);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[0], x4p15);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[5], x5p14);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[7], x6p13);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[2], x7p12);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[1], x8p11);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[6], x9p10);
let m0514b = SseVector::mul(self.twiddles_im[4], x1m18);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[8], x2m17);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[3], x3m16);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[0], x4m15);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[5], x5m14);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[7], x6m13);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[2], x7m12);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[1], x8m11);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[6], x9m10);
let [y05, y14] = SseVector::column_butterfly2([m0514a, m0514b]);
// Outputs 6 and 13 (k = 6).
let m0613a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p18);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[6], x2p17);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[0], x3p16);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[4], x4p15);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[7], x5p14);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[1], x6p13);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[3], x7p12);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[8], x8p11);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[2], x9p10);
let m0613b = SseVector::mul(self.twiddles_im[5], x1m18);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[6], x2m17);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[0], x3m16);
let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[4], x4m15);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[7], x5m14);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[1], x6m13);
let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[3], x7m12);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[8], x8m11);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[2], x9m10);
let [y06, y13] = SseVector::column_butterfly2([m0613a, m0613b]);
// Outputs 7 and 12 (k = 7).
let m0712a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p18);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[4], x2p17);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[1], x3p16);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[8], x4p15);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[2], x5p14);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[3], x6p13);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[7], x7p12);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[0], x8p11);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[5], x9p10);
let m0712b = SseVector::mul(self.twiddles_im[6], x1m18);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[4], x2m17);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[1], x3m16);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[8], x4m15);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[2], x5m14);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[3], x6m13);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[7], x7m12);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[0], x8m11);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[5], x9m10);
let [y07, y12] = SseVector::column_butterfly2([m0712a, m0712b]);
// Outputs 8 and 11 (k = 8).
let m0811a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p18);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[2], x2p17);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[4], x3p16);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[5], x4p15);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[1], x5p14);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[8], x6p13);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[0], x7p12);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[6], x8p11);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[3], x9p10);
let m0811b = SseVector::mul(self.twiddles_im[7], x1m18);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[2], x2m17);
let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[4], x3m16);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[5], x4m15);
let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[1], x5m14);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[8], x6m13);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[0], x7m12);
let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[6], x8m11);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[3], x9m10);
let [y08, y11] = SseVector::column_butterfly2([m0811a, m0811b]);
// Outputs 9 and 10 (k = 9).
let m0910a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p18);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[0], x2p17);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[7], x3p16);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[1], x4p15);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[6], x5p14);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[2], x6p13);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[5], x7p12);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[3], x8p11);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[4], x9p10);
let m0910b = SseVector::mul(self.twiddles_im[8], x1m18);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[0], x2m17);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[7], x3m16);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[1], x4m15);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[6], x5m14);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[2], x6m13);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[5], x7m12);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[3], x8m11);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[4], x9m10);
let [y09, y10] = SseVector::column_butterfly2([m0910a, m0910b]);
// Results are returned in natural output order 0..=18.
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18]
}
}
/// SSE butterfly for length-19 FFTs over `f64` data.
///
/// `T` is carried only as a marker type; `new` asserts that it is `f64`.
struct SseF64Butterfly19<T> {
/// Direction this instance was built with; read back through the closure given to the
/// boilerplate macro below.
direction: FftDirection,
/// Real parts of the 9 twiddles returned by `make_twiddles(19, direction)`, each one
/// broadcast into its own vector by `new`.
twiddles_re: [__m128d; 9],
/// Imaginary parts of the same 9 twiddles, broadcast the same way.
twiddles_im: [__m128d; 9],
/// Zero-sized marker that ties the struct to `T` without storing a value of it.
_phantom: std::marker::PhantomData<T>,
}
// Expands the shared f64 SSE butterfly boilerplate for this type with length 19; the closure
// tells the macro how to read the direction. The macro is defined elsewhere in the crate
// (presumably it supplies the FFT trait implementations -- confirm at its definition).
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly19, 19, |this: &SseF64Butterfly19<_>| this.direction);
impl<T: FftNum> SseF64Butterfly19<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let twiddles = make_twiddles(19, direction);
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs one length-19 FFT in place on `buffer`.
///
/// Loads elements 0..=18 with `read_complex_to_array!`, pushes them through
/// `perform_fft_direct` and stores the result back to the same 19 positions with
/// `write_complex_to_array!`.
///
/// # Safety
/// The contract is not stated in this part of the file; presumably the CPU must support
/// SSE4.1 and `buffer` must hold at least 19 elements -- confirm against the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
    let loaded = read_complex_to_array!(buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
    });
    let spectrum = self.perform_fft_direct(loaded);
    write_complex_to_array!(spectrum, buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
    });
}
/// Length-19 butterfly kernel: takes 19 already-loaded vectors and returns the 19 output vectors.
///
/// Structure of the body:
/// 1. Inputs `n` and `19 - n` (n = 1..=9) are paired through `column_butterfly2`, giving
///    `x{n}p{19-n}` and `x{n}m{19-n}`. The `m` term is passed through `apply_rotate90`, and
///    the `p` term is added into `y00`, which starts as `values[0]`.
/// 2. For every output pair `(k, 19 - k)` an "a" accumulator starts from `values[0]` and
///    adds the `p` terms weighted by `twiddles_re`, while a "b" accumulator combines the
///    rotated `m` terms weighted by `twiddles_im`.
/// 3. `column_butterfly2([a, b])` produces outputs `k` and `19 - k`.
///
/// Twiddle selection: for output `k` and input pair `j`, let `r = (k * j) % 19`. The table
/// slot used is `min(r, 19 - r) - 1`, and the `twiddles_im` term is applied with `nmadd`
/// instead of `fmadd` when `r > 9`.
///
/// Presumably `column_butterfly2` returns `[sum, difference]` and `fmadd(acc, a, b)` /
/// `nmadd(acc, a, b)` compute `acc + a * b` / `acc - a * b` -- confirm in the `SseVector`
/// definition, which is outside this part of the file.
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 19]) -> [__m128d; 19] {
// The rotation is always built for `FftDirection::Inverse`; `self.direction` is not read in
// this method and only reaches it through the twiddle tables filled in `new`.
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Stage 1: pair inputs 1..=9 with 18..=10, rotate the `m` terms, accumulate y00.
let y00 = values[0];
let [x1p18, x1m18] = SseVector::column_butterfly2([values[1], values[18]]);
let x1m18 = SseVector::apply_rotate90(rotate, x1m18);
let y00 = SseVector::add(y00, x1p18);
let [x2p17, x2m17] = SseVector::column_butterfly2([values[2], values[17]]);
let x2m17 = SseVector::apply_rotate90(rotate, x2m17);
let y00 = SseVector::add(y00, x2p17);
let [x3p16, x3m16] = SseVector::column_butterfly2([values[3], values[16]]);
let x3m16 = SseVector::apply_rotate90(rotate, x3m16);
let y00 = SseVector::add(y00, x3p16);
let [x4p15, x4m15] = SseVector::column_butterfly2([values[4], values[15]]);
let x4m15 = SseVector::apply_rotate90(rotate, x4m15);
let y00 = SseVector::add(y00, x4p15);
let [x5p14, x5m14] = SseVector::column_butterfly2([values[5], values[14]]);
let x5m14 = SseVector::apply_rotate90(rotate, x5m14);
let y00 = SseVector::add(y00, x5p14);
let [x6p13, x6m13] = SseVector::column_butterfly2([values[6], values[13]]);
let x6m13 = SseVector::apply_rotate90(rotate, x6m13);
let y00 = SseVector::add(y00, x6p13);
let [x7p12, x7m12] = SseVector::column_butterfly2([values[7], values[12]]);
let x7m12 = SseVector::apply_rotate90(rotate, x7m12);
let y00 = SseVector::add(y00, x7p12);
let [x8p11, x8m11] = SseVector::column_butterfly2([values[8], values[11]]);
let x8m11 = SseVector::apply_rotate90(rotate, x8m11);
let y00 = SseVector::add(y00, x8p11);
let [x9p10, x9m10] = SseVector::column_butterfly2([values[9], values[10]]);
let x9m10 = SseVector::apply_rotate90(rotate, x9m10);
let y00 = SseVector::add(y00, x9p10);
// Outputs 1 and 18 (k = 1): slots run 0..=8 in order, every `b` term added.
let m0118a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p18);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[1], x2p17);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[2], x3p16);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[3], x4p15);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[4], x5p14);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[5], x6p13);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[6], x7p12);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[7], x8p11);
let m0118a = SseVector::fmadd(m0118a, self.twiddles_re[8], x9p10);
let m0118b = SseVector::mul(self.twiddles_im[0], x1m18);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[1], x2m17);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[2], x3m16);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[3], x4m15);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[4], x5m14);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[5], x6m13);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[6], x7m12);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[7], x8m11);
let m0118b = SseVector::fmadd(m0118b, self.twiddles_im[8], x9m10);
let [y01, y18] = SseVector::column_butterfly2([m0118a, m0118b]);
// Outputs 2 and 17 (k = 2).
let m0217a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p18);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[3], x2p17);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[5], x3p16);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[7], x4p15);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[8], x5p14);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[6], x6p13);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[4], x7p12);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[2], x8p11);
let m0217a = SseVector::fmadd(m0217a, self.twiddles_re[0], x9p10);
let m0217b = SseVector::mul(self.twiddles_im[1], x1m18);
let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[3], x2m17);
let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[5], x3m16);
let m0217b = SseVector::fmadd(m0217b, self.twiddles_im[7], x4m15);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[8], x5m14);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[6], x6m13);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[4], x7m12);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[2], x8m11);
let m0217b = SseVector::nmadd(m0217b, self.twiddles_im[0], x9m10);
let [y02, y17] = SseVector::column_butterfly2([m0217a, m0217b]);
// Outputs 3 and 16 (k = 3).
let m0316a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p18);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[5], x2p17);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[8], x3p16);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[6], x4p15);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[3], x5p14);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[0], x6p13);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[1], x7p12);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[4], x8p11);
let m0316a = SseVector::fmadd(m0316a, self.twiddles_re[7], x9p10);
let m0316b = SseVector::mul(self.twiddles_im[2], x1m18);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[5], x2m17);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[8], x3m16);
let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[6], x4m15);
let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[3], x5m14);
let m0316b = SseVector::nmadd(m0316b, self.twiddles_im[0], x6m13);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[1], x7m12);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[4], x8m11);
let m0316b = SseVector::fmadd(m0316b, self.twiddles_im[7], x9m10);
let [y03, y16] = SseVector::column_butterfly2([m0316a, m0316b]);
// Outputs 4 and 15 (k = 4).
let m0415a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p18);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[7], x2p17);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[6], x3p16);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[2], x4p15);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[0], x5p14);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[4], x6p13);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[8], x7p12);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[5], x8p11);
let m0415a = SseVector::fmadd(m0415a, self.twiddles_re[1], x9p10);
let m0415b = SseVector::mul(self.twiddles_im[3], x1m18);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[7], x2m17);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[6], x3m16);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[2], x4m15);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[0], x5m14);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[4], x6m13);
let m0415b = SseVector::fmadd(m0415b, self.twiddles_im[8], x7m12);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[5], x8m11);
let m0415b = SseVector::nmadd(m0415b, self.twiddles_im[1], x9m10);
let [y04, y15] = SseVector::column_butterfly2([m0415a, m0415b]);
// Outputs 5 and 14 (k = 5).
let m0514a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p18);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[8], x2p17);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[3], x3p16);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[0], x4p15);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[5], x5p14);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[7], x6p13);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[2], x7p12);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[1], x8p11);
let m0514a = SseVector::fmadd(m0514a, self.twiddles_re[6], x9p10);
let m0514b = SseVector::mul(self.twiddles_im[4], x1m18);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[8], x2m17);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[3], x3m16);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[0], x4m15);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[5], x5m14);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[7], x6m13);
let m0514b = SseVector::nmadd(m0514b, self.twiddles_im[2], x7m12);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[1], x8m11);
let m0514b = SseVector::fmadd(m0514b, self.twiddles_im[6], x9m10);
let [y05, y14] = SseVector::column_butterfly2([m0514a, m0514b]);
// Outputs 6 and 13 (k = 6).
let m0613a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p18);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[6], x2p17);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[0], x3p16);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[4], x4p15);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[7], x5p14);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[1], x6p13);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[3], x7p12);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[8], x8p11);
let m0613a = SseVector::fmadd(m0613a, self.twiddles_re[2], x9p10);
let m0613b = SseVector::mul(self.twiddles_im[5], x1m18);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[6], x2m17);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[0], x3m16);
let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[4], x4m15);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[7], x5m14);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[1], x6m13);
let m0613b = SseVector::fmadd(m0613b, self.twiddles_im[3], x7m12);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[8], x8m11);
let m0613b = SseVector::nmadd(m0613b, self.twiddles_im[2], x9m10);
let [y06, y13] = SseVector::column_butterfly2([m0613a, m0613b]);
// Outputs 7 and 12 (k = 7).
let m0712a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p18);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[4], x2p17);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[1], x3p16);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[8], x4p15);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[2], x5p14);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[3], x6p13);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[7], x7p12);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[0], x8p11);
let m0712a = SseVector::fmadd(m0712a, self.twiddles_re[5], x9p10);
let m0712b = SseVector::mul(self.twiddles_im[6], x1m18);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[4], x2m17);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[1], x3m16);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[8], x4m15);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[2], x5m14);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[3], x6m13);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[7], x7m12);
let m0712b = SseVector::nmadd(m0712b, self.twiddles_im[0], x8m11);
let m0712b = SseVector::fmadd(m0712b, self.twiddles_im[5], x9m10);
let [y07, y12] = SseVector::column_butterfly2([m0712a, m0712b]);
// Outputs 8 and 11 (k = 8).
let m0811a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p18);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[2], x2p17);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[4], x3p16);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[5], x4p15);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[1], x5p14);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[8], x6p13);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[0], x7p12);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[6], x8p11);
let m0811a = SseVector::fmadd(m0811a, self.twiddles_re[3], x9p10);
let m0811b = SseVector::mul(self.twiddles_im[7], x1m18);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[2], x2m17);
let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[4], x3m16);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[5], x4m15);
let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[1], x5m14);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[8], x6m13);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[0], x7m12);
let m0811b = SseVector::fmadd(m0811b, self.twiddles_im[6], x8m11);
let m0811b = SseVector::nmadd(m0811b, self.twiddles_im[3], x9m10);
let [y08, y11] = SseVector::column_butterfly2([m0811a, m0811b]);
// Outputs 9 and 10 (k = 9).
let m0910a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p18);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[0], x2p17);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[7], x3p16);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[1], x4p15);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[6], x5p14);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[2], x6p13);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[5], x7p12);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[3], x8p11);
let m0910a = SseVector::fmadd(m0910a, self.twiddles_re[4], x9p10);
let m0910b = SseVector::mul(self.twiddles_im[8], x1m18);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[0], x2m17);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[7], x3m16);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[1], x4m15);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[6], x5m14);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[2], x6m13);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[5], x7m12);
let m0910b = SseVector::nmadd(m0910b, self.twiddles_im[3], x8m11);
let m0910b = SseVector::fmadd(m0910b, self.twiddles_im[4], x9m10);
let [y09, y10] = SseVector::column_butterfly2([m0910a, m0910b]);
// Results are returned in natural output order 0..=18.
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18]
}
}
/// SSE butterfly for length-23 FFTs over `f32` data.
///
/// `T` is carried only as a marker type; `new` asserts that it is `f32`.
struct SseF32Butterfly23<T> {
/// Direction this instance was built with; read back through the closure given to the
/// boilerplate macro below.
direction: FftDirection,
/// Real parts of the 11 twiddles returned by `make_twiddles(23, direction)`, each one
/// broadcast into its own vector by `new`.
twiddles_re: [__m128; 11],
/// Imaginary parts of the same 11 twiddles, broadcast the same way.
twiddles_im: [__m128; 11],
/// Zero-sized marker that ties the struct to `T` without storing a value of it.
_phantom: std::marker::PhantomData<T>,
}
// Expands the shared f32 SSE butterfly boilerplate for this type with length 23; the closure
// tells the macro how to read the direction. The macro is defined elsewhere in the crate
// (presumably it supplies the FFT trait implementations -- confirm at its definition).
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly23, 23, |this: &SseF32Butterfly23<_>| this.direction);
impl<T: FftNum> SseF32Butterfly23<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let twiddles = make_twiddles(23, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs one length-23 FFT in place on `buffer`.
///
/// Loads elements 0..=22 with `read_partial1_complex_to_array!`, pushes them through
/// `perform_parallel_fft_direct` and stores the result back to the same 23 positions with
/// `write_partial_lo_complex_to_array!`. Presumably each load fills only one complex slot
/// of its vector and the store writes only the low one -- confirm at the macro definitions.
///
/// # Safety
/// The contract is not stated in this part of the file; presumably the CPU must support
/// SSE4.1 and `buffer` must hold at least 23 elements -- confirm against the callers.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let loaded = read_partial1_complex_to_array!(buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
    });
    let spectrum = self.perform_parallel_fft_direct(loaded);
    write_partial_lo_complex_to_array!(spectrum, buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
    });
}
/// Runs the length-23 butterfly on 23 vectors loaded from `buffer` and writes 23 vectors back.
///
/// The 23 loads use indices 0, 2, ..., 44. They are then regrouped so that entry `i` of the
/// kernel input combines one half of `packed_in[i / 2]` with one half of
/// `packed_in[(23 + i) / 2]`. After the kernel, the inverse regrouping is applied before the
/// strided store. Presumably every vector holds two complex `f32` values, so this processes
/// two back-to-back length-23 signals at once -- confirm against the `extract_*` helpers.
///
/// # Safety
/// The contract is not stated in this part of the file; presumably the CPU must support
/// SSE4.1 and `buffer` must hold at least 46 elements -- confirm against the callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let packed_in = read_complex_to_array!(buffer, {
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44
    });

    // Even entries take (lo, hi), odd entries take (hi, lo); the second operand runs
    // through packed_in[11..=22].
    let lanes = [
        extract_lo_hi_f32(packed_in[0], packed_in[11]),
        extract_hi_lo_f32(packed_in[0], packed_in[12]),
        extract_lo_hi_f32(packed_in[1], packed_in[12]),
        extract_hi_lo_f32(packed_in[1], packed_in[13]),
        extract_lo_hi_f32(packed_in[2], packed_in[13]),
        extract_hi_lo_f32(packed_in[2], packed_in[14]),
        extract_lo_hi_f32(packed_in[3], packed_in[14]),
        extract_hi_lo_f32(packed_in[3], packed_in[15]),
        extract_lo_hi_f32(packed_in[4], packed_in[15]),
        extract_hi_lo_f32(packed_in[4], packed_in[16]),
        extract_lo_hi_f32(packed_in[5], packed_in[16]),
        extract_hi_lo_f32(packed_in[5], packed_in[17]),
        extract_lo_hi_f32(packed_in[6], packed_in[17]),
        extract_hi_lo_f32(packed_in[6], packed_in[18]),
        extract_lo_hi_f32(packed_in[7], packed_in[18]),
        extract_hi_lo_f32(packed_in[7], packed_in[19]),
        extract_lo_hi_f32(packed_in[8], packed_in[19]),
        extract_hi_lo_f32(packed_in[8], packed_in[20]),
        extract_lo_hi_f32(packed_in[9], packed_in[20]),
        extract_hi_lo_f32(packed_in[9], packed_in[21]),
        extract_lo_hi_f32(packed_in[10], packed_in[21]),
        extract_hi_lo_f32(packed_in[10], packed_in[22]),
        extract_lo_hi_f32(packed_in[11], packed_in[22]),
    ];

    let spectrum = self.perform_parallel_fft_direct(lanes);

    // First the low halves in output order, then the single mixed entry (22 low, 0 high),
    // then the high halves.
    let packed_out = [
        extract_lo_lo_f32(spectrum[0], spectrum[1]),
        extract_lo_lo_f32(spectrum[2], spectrum[3]),
        extract_lo_lo_f32(spectrum[4], spectrum[5]),
        extract_lo_lo_f32(spectrum[6], spectrum[7]),
        extract_lo_lo_f32(spectrum[8], spectrum[9]),
        extract_lo_lo_f32(spectrum[10], spectrum[11]),
        extract_lo_lo_f32(spectrum[12], spectrum[13]),
        extract_lo_lo_f32(spectrum[14], spectrum[15]),
        extract_lo_lo_f32(spectrum[16], spectrum[17]),
        extract_lo_lo_f32(spectrum[18], spectrum[19]),
        extract_lo_lo_f32(spectrum[20], spectrum[21]),
        extract_lo_hi_f32(spectrum[22], spectrum[0]),
        extract_hi_hi_f32(spectrum[1], spectrum[2]),
        extract_hi_hi_f32(spectrum[3], spectrum[4]),
        extract_hi_hi_f32(spectrum[5], spectrum[6]),
        extract_hi_hi_f32(spectrum[7], spectrum[8]),
        extract_hi_hi_f32(spectrum[9], spectrum[10]),
        extract_hi_hi_f32(spectrum[11], spectrum[12]),
        extract_hi_hi_f32(spectrum[13], spectrum[14]),
        extract_hi_hi_f32(spectrum[15], spectrum[16]),
        extract_hi_hi_f32(spectrum[17], spectrum[18]),
        extract_hi_hi_f32(spectrum[19], spectrum[20]),
        extract_hi_hi_f32(spectrum[21], spectrum[22]),
    ];

    write_complex_to_array_strided!(packed_out, buffer, 2, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22
    });
}
/// Computes the 23 outputs of a length-23 butterfly from 23 input vectors and
/// returns them in natural order `[y00, y01, ..., y22]`.
///
/// Each `__m128` presumably carries two complex f32 values (one per 64-bit half),
/// so two independent transforms are evaluated at once -- confirm against the
/// `SseVector` implementation.
///
/// Structure of the computation:
/// * Inputs `k` and `23 - k` (k = 1..=11) are combined with `column_butterfly2`
///   into `x{k}p{23-k}` and `x{k}m{23-k}`; the second result is then passed
///   through `apply_rotate90`.
/// * `y00` is `values[0]` plus every `p` term.
/// * Each output pair `(y[n], y[23 - n])` is a final `column_butterfly2` of two
///   accumulators: `m..a` (the `p` terms weighted by `twiddles_re`, seeded with
///   `values[0]`) and `m..b` (the rotated `m` terms weighted by `twiddles_im`).
/// * For output row `n` and input pair `k`, let `r = (n * k) % 23`. The twiddle
///   slot used is `min(r, 23 - r) - 1`, and the `b` accumulator uses `nmadd`
///   instead of `fmadd` exactly when `r > 11`.
///
/// Assumes `fmadd(acc, a, b)` is `acc + a * b`, `nmadd(acc, a, b)` is
/// `acc - a * b`, and `column_butterfly2([a, b])` is `[a + b, a - b]` -- TODO confirm.
///
/// NOTE(review): the rotation is always built from `FftDirection::Inverse`,
/// independent of `self.direction`; presumably the direction is already encoded
/// in the precomputed twiddles -- confirm against the constructor.
///
/// # Safety
/// Presumably requires SSE4.1 support on the running CPU -- confirm against callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 23]) -> [__m128; 23] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
// Pair up inputs k and 23 - k, rotate the `m` half, and accumulate y00 along the way.
let y00 = values[0];
let [x1p22, x1m22] = SseVector::column_butterfly2([values[1], values[22]]);
let x1m22 = SseVector::apply_rotate90(rotate, x1m22);
let y00 = SseVector::add(y00, x1p22);
let [x2p21, x2m21] = SseVector::column_butterfly2([values[2], values[21]]);
let x2m21 = SseVector::apply_rotate90(rotate, x2m21);
let y00 = SseVector::add(y00, x2p21);
let [x3p20, x3m20] = SseVector::column_butterfly2([values[3], values[20]]);
let x3m20 = SseVector::apply_rotate90(rotate, x3m20);
let y00 = SseVector::add(y00, x3p20);
let [x4p19, x4m19] = SseVector::column_butterfly2([values[4], values[19]]);
let x4m19 = SseVector::apply_rotate90(rotate, x4m19);
let y00 = SseVector::add(y00, x4p19);
let [x5p18, x5m18] = SseVector::column_butterfly2([values[5], values[18]]);
let x5m18 = SseVector::apply_rotate90(rotate, x5m18);
let y00 = SseVector::add(y00, x5p18);
let [x6p17, x6m17] = SseVector::column_butterfly2([values[6], values[17]]);
let x6m17 = SseVector::apply_rotate90(rotate, x6m17);
let y00 = SseVector::add(y00, x6p17);
let [x7p16, x7m16] = SseVector::column_butterfly2([values[7], values[16]]);
let x7m16 = SseVector::apply_rotate90(rotate, x7m16);
let y00 = SseVector::add(y00, x7p16);
let [x8p15, x8m15] = SseVector::column_butterfly2([values[8], values[15]]);
let x8m15 = SseVector::apply_rotate90(rotate, x8m15);
let y00 = SseVector::add(y00, x8p15);
let [x9p14, x9m14] = SseVector::column_butterfly2([values[9], values[14]]);
let x9m14 = SseVector::apply_rotate90(rotate, x9m14);
let y00 = SseVector::add(y00, x9p14);
let [x10p13, x10m13] = SseVector::column_butterfly2([values[10], values[13]]);
let x10m13 = SseVector::apply_rotate90(rotate, x10m13);
let y00 = SseVector::add(y00, x10p13);
let [x11p12, x11m12] = SseVector::column_butterfly2([values[11], values[12]]);
let x11m12 = SseVector::apply_rotate90(rotate, x11m12);
let y00 = SseVector::add(y00, x11p12);
// Output pair (y01, y22): row n = 1, twiddle slots run 0..=10 with no sign flips.
let m0122a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p22);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[1], x2p21);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[2], x3p20);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[3], x4p19);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[4], x5p18);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[5], x6p17);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[6], x7p16);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[7], x8p15);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[8], x9p14);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[9], x10p13);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[10], x11p12);
let m0122b = SseVector::mul(self.twiddles_im[0], x1m22);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[1], x2m21);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[2], x3m20);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[3], x4m19);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[4], x5m18);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[5], x6m17);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[6], x7m16);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[7], x8m15);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[8], x9m14);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[9], x10m13);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[10], x11m12);
let [y01, y22] = SseVector::column_butterfly2([m0122a, m0122b]);
// Output pair (y02, y21): row n = 2.
let m0221a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p22);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[3], x2p21);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[5], x3p20);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[7], x4p19);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[9], x5p18);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[10], x6p17);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[8], x7p16);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[6], x8p15);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[4], x9p14);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[2], x10p13);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[0], x11p12);
let m0221b = SseVector::mul(self.twiddles_im[1], x1m22);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[3], x2m21);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[5], x3m20);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[7], x4m19);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[9], x5m18);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[10], x6m17);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[8], x7m16);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[6], x8m15);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[4], x9m14);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[2], x10m13);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[0], x11m12);
let [y02, y21] = SseVector::column_butterfly2([m0221a, m0221b]);
// Output pair (y03, y20): row n = 3.
let m0320a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p22);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[5], x2p21);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[8], x3p20);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[10], x4p19);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[7], x5p18);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[4], x6p17);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[1], x7p16);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[0], x8p15);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[3], x9p14);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[6], x10p13);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[9], x11p12);
let m0320b = SseVector::mul(self.twiddles_im[2], x1m22);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[5], x2m21);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[8], x3m20);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[10], x4m19);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[7], x5m18);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[4], x6m17);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[1], x7m16);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[0], x8m15);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[3], x9m14);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[6], x10m13);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[9], x11m12);
let [y03, y20] = SseVector::column_butterfly2([m0320a, m0320b]);
// Output pair (y04, y19): row n = 4.
let m0419a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p22);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[7], x2p21);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[10], x3p20);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[6], x4p19);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[2], x5p18);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[0], x6p17);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[4], x7p16);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[8], x8p15);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[9], x9p14);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[5], x10p13);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[1], x11p12);
let m0419b = SseVector::mul(self.twiddles_im[3], x1m22);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[7], x2m21);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[10], x3m20);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[6], x4m19);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[2], x5m18);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[0], x6m17);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[4], x7m16);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[8], x8m15);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[9], x9m14);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[5], x10m13);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[1], x11m12);
let [y04, y19] = SseVector::column_butterfly2([m0419a, m0419b]);
// Output pair (y05, y18): row n = 5.
let m0518a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p22);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[9], x2p21);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[7], x3p20);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[2], x4p19);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[1], x5p18);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[6], x6p17);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[10], x7p16);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[5], x8p15);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[0], x9p14);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[3], x10p13);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[8], x11p12);
let m0518b = SseVector::mul(self.twiddles_im[4], x1m22);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[9], x2m21);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[7], x3m20);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[2], x4m19);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[1], x5m18);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[6], x6m17);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[10], x7m16);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[5], x8m15);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[0], x9m14);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[3], x10m13);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[8], x11m12);
let [y05, y18] = SseVector::column_butterfly2([m0518a, m0518b]);
// Output pair (y06, y17): row n = 6.
let m0617a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p22);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[10], x2p21);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[4], x3p20);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[0], x4p19);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[6], x5p18);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[9], x6p17);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[3], x7p16);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[1], x8p15);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[7], x9p14);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[8], x10p13);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[2], x11p12);
let m0617b = SseVector::mul(self.twiddles_im[5], x1m22);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[10], x2m21);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[4], x3m20);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[0], x4m19);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[6], x5m18);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[9], x6m17);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[3], x7m16);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[1], x8m15);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[7], x9m14);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[8], x10m13);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[2], x11m12);
let [y06, y17] = SseVector::column_butterfly2([m0617a, m0617b]);
// Output pair (y07, y16): row n = 7.
let m0716a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p22);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[8], x2p21);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[1], x3p20);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[4], x4p19);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[10], x5p18);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[3], x6p17);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[2], x7p16);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[9], x8p15);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[5], x9p14);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[0], x10p13);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[7], x11p12);
let m0716b = SseVector::mul(self.twiddles_im[6], x1m22);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[8], x2m21);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[1], x3m20);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[4], x4m19);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[10], x5m18);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[3], x6m17);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[2], x7m16);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[9], x8m15);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[5], x9m14);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[0], x10m13);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[7], x11m12);
let [y07, y16] = SseVector::column_butterfly2([m0716a, m0716b]);
// Output pair (y08, y15): row n = 8.
let m0815a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p22);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[6], x2p21);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[0], x3p20);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[8], x4p19);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[5], x5p18);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[1], x6p17);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[9], x7p16);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[4], x8p15);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[2], x9p14);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[10], x10p13);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[3], x11p12);
let m0815b = SseVector::mul(self.twiddles_im[7], x1m22);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[6], x2m21);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[0], x3m20);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[8], x4m19);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[5], x5m18);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[1], x6m17);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[9], x7m16);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[4], x8m15);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[2], x9m14);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[10], x10m13);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[3], x11m12);
let [y08, y15] = SseVector::column_butterfly2([m0815a, m0815b]);
// Output pair (y09, y14): row n = 9.
let m0914a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p22);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[4], x2p21);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[3], x3p20);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[9], x4p19);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[0], x5p18);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[7], x6p17);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[5], x7p16);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[2], x8p15);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[10], x9p14);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[1], x10p13);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[6], x11p12);
let m0914b = SseVector::mul(self.twiddles_im[8], x1m22);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[4], x2m21);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[3], x3m20);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[9], x4m19);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[0], x5m18);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[7], x6m17);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[5], x7m16);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[2], x8m15);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[10], x9m14);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[1], x10m13);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[6], x11m12);
let [y09, y14] = SseVector::column_butterfly2([m0914a, m0914b]);
// Output pair (y10, y13): row n = 10.
let m1013a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p22);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[2], x2p21);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[6], x3p20);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[5], x4p19);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[3], x5p18);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[8], x6p17);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[0], x7p16);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[10], x8p15);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[1], x9p14);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[7], x10p13);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[4], x11p12);
let m1013b = SseVector::mul(self.twiddles_im[9], x1m22);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[2], x2m21);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[6], x3m20);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[5], x4m19);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[3], x5m18);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[8], x6m17);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[0], x7m16);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[10], x8m15);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[1], x9m14);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[7], x10m13);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[4], x11m12);
let [y10, y13] = SseVector::column_butterfly2([m1013a, m1013b]);
// Output pair (y11, y12): row n = 11, signs alternate on every term.
let m1112a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p22);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[0], x2p21);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[9], x3p20);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[1], x4p19);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[8], x5p18);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[2], x6p17);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[7], x7p16);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[3], x8p15);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[6], x9p14);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[4], x10p13);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[5], x11p12);
let m1112b = SseVector::mul(self.twiddles_im[10], x1m22);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[0], x2m21);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[9], x3m20);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[1], x4m19);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[8], x5m18);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[2], x6m17);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[7], x7m16);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[3], x8m15);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[6], x9m14);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[4], x10m13);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[5], x11m12);
let [y11, y12] = SseVector::column_butterfly2([m1112a, m1112b]);
// Return the outputs in natural order.
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
}
}
/// Length-23 butterfly operating on `f64` data through SSE `__m128d` vectors.
struct SseF64Butterfly23<T> {
    /// Transform direction supplied at construction time.
    direction: FftDirection,
    /// Real parts of the 11 precomputed twiddle factors, each broadcast across a vector.
    twiddles_re: [__m128d; 11],
    /// Imaginary parts of the same 11 twiddle factors, each broadcast across a vector.
    twiddles_im: [__m128d; 11],
    /// Ties the struct to the scalar type `T` without storing a value of it.
    _phantom: core::marker::PhantomData<T>,
}
// Presumably expands to the shared trait/boilerplate impls for the length-23 f64
// butterfly; the closure is how the macro reads the direction off an instance.
// TODO confirm against the macro definition, which is not visible here.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly23, 23, |this: &SseF64Butterfly23<_>| this.direction);
impl<T: FftNum> SseF64Butterfly23<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let twiddles = make_twiddles(23, direction);
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs a single length-23 `f64` transform in place on `buffer`.
///
/// The 23 elements at indices `0..=22` are loaded, handed to
/// `perform_fft_direct`, and the results are stored back to the same indices.
///
/// # Safety
/// Presumably requires SSE4.1 support and a `buffer` holding at least 23 complex
/// values -- confirm against the `SseArrayMut` accessors.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
    let input = read_complex_to_array!(buffer, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 });

    let output = self.perform_fft_direct(input);

    write_complex_to_array!(output, buffer, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22 });
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 23]) -> [__m128d; 23] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
let y00 = values[0];
let [x1p22, x1m22] = SseVector::column_butterfly2([values[1], values[22]]);
let x1m22 = SseVector::apply_rotate90(rotate, x1m22);
let y00 = SseVector::add(y00, x1p22);
let [x2p21, x2m21] = SseVector::column_butterfly2([values[2], values[21]]);
let x2m21 = SseVector::apply_rotate90(rotate, x2m21);
let y00 = SseVector::add(y00, x2p21);
let [x3p20, x3m20] = SseVector::column_butterfly2([values[3], values[20]]);
let x3m20 = SseVector::apply_rotate90(rotate, x3m20);
let y00 = SseVector::add(y00, x3p20);
let [x4p19, x4m19] = SseVector::column_butterfly2([values[4], values[19]]);
let x4m19 = SseVector::apply_rotate90(rotate, x4m19);
let y00 = SseVector::add(y00, x4p19);
let [x5p18, x5m18] = SseVector::column_butterfly2([values[5], values[18]]);
let x5m18 = SseVector::apply_rotate90(rotate, x5m18);
let y00 = SseVector::add(y00, x5p18);
let [x6p17, x6m17] = SseVector::column_butterfly2([values[6], values[17]]);
let x6m17 = SseVector::apply_rotate90(rotate, x6m17);
let y00 = SseVector::add(y00, x6p17);
let [x7p16, x7m16] = SseVector::column_butterfly2([values[7], values[16]]);
let x7m16 = SseVector::apply_rotate90(rotate, x7m16);
let y00 = SseVector::add(y00, x7p16);
let [x8p15, x8m15] = SseVector::column_butterfly2([values[8], values[15]]);
let x8m15 = SseVector::apply_rotate90(rotate, x8m15);
let y00 = SseVector::add(y00, x8p15);
let [x9p14, x9m14] = SseVector::column_butterfly2([values[9], values[14]]);
let x9m14 = SseVector::apply_rotate90(rotate, x9m14);
let y00 = SseVector::add(y00, x9p14);
let [x10p13, x10m13] = SseVector::column_butterfly2([values[10], values[13]]);
let x10m13 = SseVector::apply_rotate90(rotate, x10m13);
let y00 = SseVector::add(y00, x10p13);
let [x11p12, x11m12] = SseVector::column_butterfly2([values[11], values[12]]);
let x11m12 = SseVector::apply_rotate90(rotate, x11m12);
let y00 = SseVector::add(y00, x11p12);
let m0122a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p22);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[1], x2p21);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[2], x3p20);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[3], x4p19);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[4], x5p18);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[5], x6p17);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[6], x7p16);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[7], x8p15);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[8], x9p14);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[9], x10p13);
let m0122a = SseVector::fmadd(m0122a, self.twiddles_re[10], x11p12);
let m0122b = SseVector::mul(self.twiddles_im[0], x1m22);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[1], x2m21);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[2], x3m20);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[3], x4m19);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[4], x5m18);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[5], x6m17);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[6], x7m16);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[7], x8m15);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[8], x9m14);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[9], x10m13);
let m0122b = SseVector::fmadd(m0122b, self.twiddles_im[10], x11m12);
let [y01, y22] = SseVector::column_butterfly2([m0122a, m0122b]);
let m0221a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p22);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[3], x2p21);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[5], x3p20);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[7], x4p19);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[9], x5p18);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[10], x6p17);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[8], x7p16);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[6], x8p15);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[4], x9p14);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[2], x10p13);
let m0221a = SseVector::fmadd(m0221a, self.twiddles_re[0], x11p12);
let m0221b = SseVector::mul(self.twiddles_im[1], x1m22);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[3], x2m21);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[5], x3m20);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[7], x4m19);
let m0221b = SseVector::fmadd(m0221b, self.twiddles_im[9], x5m18);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[10], x6m17);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[8], x7m16);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[6], x8m15);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[4], x9m14);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[2], x10m13);
let m0221b = SseVector::nmadd(m0221b, self.twiddles_im[0], x11m12);
let [y02, y21] = SseVector::column_butterfly2([m0221a, m0221b]);
let m0320a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p22);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[5], x2p21);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[8], x3p20);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[10], x4p19);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[7], x5p18);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[4], x6p17);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[1], x7p16);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[0], x8p15);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[3], x9p14);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[6], x10p13);
let m0320a = SseVector::fmadd(m0320a, self.twiddles_re[9], x11p12);
let m0320b = SseVector::mul(self.twiddles_im[2], x1m22);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[5], x2m21);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[8], x3m20);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[10], x4m19);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[7], x5m18);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[4], x6m17);
let m0320b = SseVector::nmadd(m0320b, self.twiddles_im[1], x7m16);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[0], x8m15);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[3], x9m14);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[6], x10m13);
let m0320b = SseVector::fmadd(m0320b, self.twiddles_im[9], x11m12);
let [y03, y20] = SseVector::column_butterfly2([m0320a, m0320b]);
let m0419a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p22);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[7], x2p21);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[10], x3p20);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[6], x4p19);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[2], x5p18);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[0], x6p17);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[4], x7p16);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[8], x8p15);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[9], x9p14);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[5], x10p13);
let m0419a = SseVector::fmadd(m0419a, self.twiddles_re[1], x11p12);
let m0419b = SseVector::mul(self.twiddles_im[3], x1m22);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[7], x2m21);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[10], x3m20);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[6], x4m19);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[2], x5m18);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[0], x6m17);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[4], x7m16);
let m0419b = SseVector::fmadd(m0419b, self.twiddles_im[8], x8m15);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[9], x9m14);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[5], x10m13);
let m0419b = SseVector::nmadd(m0419b, self.twiddles_im[1], x11m12);
let [y04, y19] = SseVector::column_butterfly2([m0419a, m0419b]);
let m0518a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p22);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[9], x2p21);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[7], x3p20);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[2], x4p19);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[1], x5p18);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[6], x6p17);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[10], x7p16);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[5], x8p15);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[0], x9p14);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[3], x10p13);
let m0518a = SseVector::fmadd(m0518a, self.twiddles_re[8], x11p12);
let m0518b = SseVector::mul(self.twiddles_im[4], x1m22);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[9], x2m21);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[7], x3m20);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[2], x4m19);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[1], x5m18);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[6], x6m17);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[10], x7m16);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[5], x8m15);
let m0518b = SseVector::nmadd(m0518b, self.twiddles_im[0], x9m14);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[3], x10m13);
let m0518b = SseVector::fmadd(m0518b, self.twiddles_im[8], x11m12);
let [y05, y18] = SseVector::column_butterfly2([m0518a, m0518b]);
let m0617a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p22);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[10], x2p21);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[4], x3p20);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[0], x4p19);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[6], x5p18);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[9], x6p17);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[3], x7p16);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[1], x8p15);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[7], x9p14);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[8], x10p13);
let m0617a = SseVector::fmadd(m0617a, self.twiddles_re[2], x11p12);
let m0617b = SseVector::mul(self.twiddles_im[5], x1m22);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[10], x2m21);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[4], x3m20);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[0], x4m19);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[6], x5m18);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[9], x6m17);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[3], x7m16);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[1], x8m15);
let m0617b = SseVector::fmadd(m0617b, self.twiddles_im[7], x9m14);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[8], x10m13);
let m0617b = SseVector::nmadd(m0617b, self.twiddles_im[2], x11m12);
let [y06, y17] = SseVector::column_butterfly2([m0617a, m0617b]);
let m0716a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p22);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[8], x2p21);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[1], x3p20);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[4], x4p19);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[10], x5p18);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[3], x6p17);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[2], x7p16);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[9], x8p15);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[5], x9p14);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[0], x10p13);
let m0716a = SseVector::fmadd(m0716a, self.twiddles_re[7], x11p12);
let m0716b = SseVector::mul(self.twiddles_im[6], x1m22);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[8], x2m21);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[1], x3m20);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[4], x4m19);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[10], x5m18);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[3], x6m17);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[2], x7m16);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[9], x8m15);
let m0716b = SseVector::nmadd(m0716b, self.twiddles_im[5], x9m14);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[0], x10m13);
let m0716b = SseVector::fmadd(m0716b, self.twiddles_im[7], x11m12);
let [y07, y16] = SseVector::column_butterfly2([m0716a, m0716b]);
let m0815a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p22);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[6], x2p21);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[0], x3p20);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[8], x4p19);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[5], x5p18);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[1], x6p17);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[9], x7p16);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[4], x8p15);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[2], x9p14);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[10], x10p13);
let m0815a = SseVector::fmadd(m0815a, self.twiddles_re[3], x11p12);
let m0815b = SseVector::mul(self.twiddles_im[7], x1m22);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[6], x2m21);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[0], x3m20);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[8], x4m19);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[5], x5m18);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[1], x6m17);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[9], x7m16);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[4], x8m15);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[2], x9m14);
let m0815b = SseVector::fmadd(m0815b, self.twiddles_im[10], x10m13);
let m0815b = SseVector::nmadd(m0815b, self.twiddles_im[3], x11m12);
let [y08, y15] = SseVector::column_butterfly2([m0815a, m0815b]);
let m0914a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p22);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[4], x2p21);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[3], x3p20);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[9], x4p19);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[0], x5p18);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[7], x6p17);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[5], x7p16);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[2], x8p15);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[10], x9p14);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[1], x10p13);
let m0914a = SseVector::fmadd(m0914a, self.twiddles_re[6], x11p12);
let m0914b = SseVector::mul(self.twiddles_im[8], x1m22);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[4], x2m21);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[3], x3m20);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[9], x4m19);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[0], x5m18);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[7], x6m17);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[5], x7m16);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[2], x8m15);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[10], x9m14);
let m0914b = SseVector::nmadd(m0914b, self.twiddles_im[1], x10m13);
let m0914b = SseVector::fmadd(m0914b, self.twiddles_im[6], x11m12);
let [y09, y14] = SseVector::column_butterfly2([m0914a, m0914b]);
let m1013a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p22);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[2], x2p21);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[6], x3p20);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[5], x4p19);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[3], x5p18);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[8], x6p17);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[0], x7p16);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[10], x8p15);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[1], x9p14);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[7], x10p13);
let m1013a = SseVector::fmadd(m1013a, self.twiddles_re[4], x11p12);
let m1013b = SseVector::mul(self.twiddles_im[9], x1m22);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[2], x2m21);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[6], x3m20);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[5], x4m19);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[3], x5m18);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[8], x6m17);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[0], x7m16);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[10], x8m15);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[1], x9m14);
let m1013b = SseVector::fmadd(m1013b, self.twiddles_im[7], x10m13);
let m1013b = SseVector::nmadd(m1013b, self.twiddles_im[4], x11m12);
let [y10, y13] = SseVector::column_butterfly2([m1013a, m1013b]);
let m1112a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p22);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[0], x2p21);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[9], x3p20);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[1], x4p19);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[8], x5p18);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[2], x6p17);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[7], x7p16);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[3], x8p15);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[6], x9p14);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[4], x10p13);
let m1112a = SseVector::fmadd(m1112a, self.twiddles_re[5], x11p12);
let m1112b = SseVector::mul(self.twiddles_im[10], x1m22);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[0], x2m21);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[9], x3m20);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[1], x4m19);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[8], x5m18);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[2], x6m17);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[7], x7m16);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[3], x8m15);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[6], x9m14);
let m1112b = SseVector::nmadd(m1112b, self.twiddles_im[4], x10m13);
let m1112b = SseVector::fmadd(m1112b, self.twiddles_im[5], x11m12);
let [y11, y12] = SseVector::column_butterfly2([m1112a, m1112b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22]
}
}
/// Hard-coded SSE butterfly for FFTs of length 29 operating on `f32` data.
///
/// The twiddle factors are stored split into real and imaginary parts, one
/// `__m128` per factor, with the scalar broadcast by the constructor.
/// Only 14 factors are kept per array, i.e. (29 - 1) / 2.
struct SseF32Butterfly29<T> {
/// Transform direction this instance was constructed for.
direction: FftDirection,
/// Real parts of the 14 twiddle factors, one broadcast vector each.
twiddles_re: [__m128; 14],
/// Imaginary parts of the 14 twiddle factors, one broadcast vector each.
twiddles_im: [__m128; 14],
/// Zero-sized marker that ties the otherwise unused type parameter `T` to the struct.
_phantom: std::marker::PhantomData<T>,
}
// Expands the crate's shared f32 SSE butterfly boilerplate for this type with length 29.
// The closure tells the macro how to read the direction from an instance.
// NOTE(review): the macro body is defined elsewhere; presumably it supplies the
// `Fft`/`Length`/`Direction` trait impls that drive the `perform_*` methods — confirm.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly29, 29, |this: &SseF32Butterfly29<_>| this.direction);
impl<T: FftNum> SseF32Butterfly29<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let twiddles = make_twiddles(29, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs one length-29 FFT in place on `buffer`.
///
/// The 29 elements at indices 0..=28 are fetched with
/// `read_partial1_complex_to_array!`, pushed through
/// `perform_parallel_fft_direct`, and stored back to the same indices with
/// `write_partial_lo_complex_to_array!`.
///
/// NOTE(review): the macro names suggest each vector carries a single complex
/// value in its low half and that only the low half is written back — confirm
/// against the macro definitions.
///
/// # Safety
///
/// Presumably requires SSE4.1 support and a buffer holding at least 29
/// complex values — verify against the callers generated by the boilerplate macro.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let input = read_partial1_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
    let output = self.perform_parallel_fft_direct(input);
    write_partial_lo_complex_to_array!(output, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 } );
}
/// Runs the length-29 butterfly on vectors packed contiguously in `buffer`, in place.
///
/// Steps:
/// 1. Load 29 vectors from the even element offsets 0, 2, ..., 56.
/// 2. Re-pair them into 29 column vectors: column `k` combines
///    `packed_in[k / 2]` with `packed_in[k / 2 + 14]` for even `k`
///    (`extract_lo_hi_f32`) and with `packed_in[k / 2 + 15]` for odd `k`
///    (`extract_hi_lo_f32`).
/// 3. Run `perform_parallel_fft_direct` on the columns.
/// 4. Re-pack the results: 14 `lo_lo` pairs of neighbouring outputs, one
///    `lo_hi` pair straddling output 28 and output 0, then 14 `hi_hi` pairs.
/// 5. Store the 29 packed vectors back with a stride of 2.
///
/// NOTE(review): presumably each vector holds two complex `f32` values, so the
/// buffer contains two back-to-back length-29 signals that are transformed
/// simultaneously, one per vector half — confirm against the `extract_*_f32`
/// helpers and the load/store macros.
///
/// # Safety
///
/// Presumably requires SSE4.1 support and a buffer holding at least 58
/// complex values — verify against the callers.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let packed_in = read_complex_to_array!(buffer, { 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56 });

    // Regroup the packed loads into one vector per FFT input index.
    let columns = [
        extract_lo_hi_f32(packed_in[0], packed_in[14]),
        extract_hi_lo_f32(packed_in[0], packed_in[15]),
        extract_lo_hi_f32(packed_in[1], packed_in[15]),
        extract_hi_lo_f32(packed_in[1], packed_in[16]),
        extract_lo_hi_f32(packed_in[2], packed_in[16]),
        extract_hi_lo_f32(packed_in[2], packed_in[17]),
        extract_lo_hi_f32(packed_in[3], packed_in[17]),
        extract_hi_lo_f32(packed_in[3], packed_in[18]),
        extract_lo_hi_f32(packed_in[4], packed_in[18]),
        extract_hi_lo_f32(packed_in[4], packed_in[19]),
        extract_lo_hi_f32(packed_in[5], packed_in[19]),
        extract_hi_lo_f32(packed_in[5], packed_in[20]),
        extract_lo_hi_f32(packed_in[6], packed_in[20]),
        extract_hi_lo_f32(packed_in[6], packed_in[21]),
        extract_lo_hi_f32(packed_in[7], packed_in[21]),
        extract_hi_lo_f32(packed_in[7], packed_in[22]),
        extract_lo_hi_f32(packed_in[8], packed_in[22]),
        extract_hi_lo_f32(packed_in[8], packed_in[23]),
        extract_lo_hi_f32(packed_in[9], packed_in[23]),
        extract_hi_lo_f32(packed_in[9], packed_in[24]),
        extract_lo_hi_f32(packed_in[10], packed_in[24]),
        extract_hi_lo_f32(packed_in[10], packed_in[25]),
        extract_lo_hi_f32(packed_in[11], packed_in[25]),
        extract_hi_lo_f32(packed_in[11], packed_in[26]),
        extract_lo_hi_f32(packed_in[12], packed_in[26]),
        extract_hi_lo_f32(packed_in[12], packed_in[27]),
        extract_lo_hi_f32(packed_in[13], packed_in[27]),
        extract_hi_lo_f32(packed_in[13], packed_in[28]),
        extract_lo_hi_f32(packed_in[14], packed_in[28]),
    ];

    let spectra = self.perform_parallel_fft_direct(columns);

    // Pack the per-index results back into the layout that was loaded.
    let packed_out = [
        extract_lo_lo_f32(spectra[0], spectra[1]),
        extract_lo_lo_f32(spectra[2], spectra[3]),
        extract_lo_lo_f32(spectra[4], spectra[5]),
        extract_lo_lo_f32(spectra[6], spectra[7]),
        extract_lo_lo_f32(spectra[8], spectra[9]),
        extract_lo_lo_f32(spectra[10], spectra[11]),
        extract_lo_lo_f32(spectra[12], spectra[13]),
        extract_lo_lo_f32(spectra[14], spectra[15]),
        extract_lo_lo_f32(spectra[16], spectra[17]),
        extract_lo_lo_f32(spectra[18], spectra[19]),
        extract_lo_lo_f32(spectra[20], spectra[21]),
        extract_lo_lo_f32(spectra[22], spectra[23]),
        extract_lo_lo_f32(spectra[24], spectra[25]),
        extract_lo_lo_f32(spectra[26], spectra[27]),
        // Odd length: this vector straddles the last and the first output index.
        extract_lo_hi_f32(spectra[28], spectra[0]),
        extract_hi_hi_f32(spectra[1], spectra[2]),
        extract_hi_hi_f32(spectra[3], spectra[4]),
        extract_hi_hi_f32(spectra[5], spectra[6]),
        extract_hi_hi_f32(spectra[7], spectra[8]),
        extract_hi_hi_f32(spectra[9], spectra[10]),
        extract_hi_hi_f32(spectra[11], spectra[12]),
        extract_hi_hi_f32(spectra[13], spectra[14]),
        extract_hi_hi_f32(spectra[15], spectra[16]),
        extract_hi_hi_f32(spectra[17], spectra[18]),
        extract_hi_hi_f32(spectra[19], spectra[20]),
        extract_hi_hi_f32(spectra[21], spectra[22]),
        extract_hi_hi_f32(spectra[23], spectra[24]),
        extract_hi_hi_f32(spectra[25], spectra[26]),
        extract_hi_hi_f32(spectra[27], spectra[28]),
    ];

    write_complex_to_array_strided!(packed_out, buffer, 2, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
}
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 29]) -> [__m128; 29] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
let y00 = values[0];
let [x1p28, x1m28] = SseVector::column_butterfly2([values[1], values[28]]);
let x1m28 = SseVector::apply_rotate90(rotate, x1m28);
let y00 = SseVector::add(y00, x1p28);
let [x2p27, x2m27] = SseVector::column_butterfly2([values[2], values[27]]);
let x2m27 = SseVector::apply_rotate90(rotate, x2m27);
let y00 = SseVector::add(y00, x2p27);
let [x3p26, x3m26] = SseVector::column_butterfly2([values[3], values[26]]);
let x3m26 = SseVector::apply_rotate90(rotate, x3m26);
let y00 = SseVector::add(y00, x3p26);
let [x4p25, x4m25] = SseVector::column_butterfly2([values[4], values[25]]);
let x4m25 = SseVector::apply_rotate90(rotate, x4m25);
let y00 = SseVector::add(y00, x4p25);
let [x5p24, x5m24] = SseVector::column_butterfly2([values[5], values[24]]);
let x5m24 = SseVector::apply_rotate90(rotate, x5m24);
let y00 = SseVector::add(y00, x5p24);
let [x6p23, x6m23] = SseVector::column_butterfly2([values[6], values[23]]);
let x6m23 = SseVector::apply_rotate90(rotate, x6m23);
let y00 = SseVector::add(y00, x6p23);
let [x7p22, x7m22] = SseVector::column_butterfly2([values[7], values[22]]);
let x7m22 = SseVector::apply_rotate90(rotate, x7m22);
let y00 = SseVector::add(y00, x7p22);
let [x8p21, x8m21] = SseVector::column_butterfly2([values[8], values[21]]);
let x8m21 = SseVector::apply_rotate90(rotate, x8m21);
let y00 = SseVector::add(y00, x8p21);
let [x9p20, x9m20] = SseVector::column_butterfly2([values[9], values[20]]);
let x9m20 = SseVector::apply_rotate90(rotate, x9m20);
let y00 = SseVector::add(y00, x9p20);
let [x10p19, x10m19] = SseVector::column_butterfly2([values[10], values[19]]);
let x10m19 = SseVector::apply_rotate90(rotate, x10m19);
let y00 = SseVector::add(y00, x10p19);
let [x11p18, x11m18] = SseVector::column_butterfly2([values[11], values[18]]);
let x11m18 = SseVector::apply_rotate90(rotate, x11m18);
let y00 = SseVector::add(y00, x11p18);
let [x12p17, x12m17] = SseVector::column_butterfly2([values[12], values[17]]);
let x12m17 = SseVector::apply_rotate90(rotate, x12m17);
let y00 = SseVector::add(y00, x12p17);
let [x13p16, x13m16] = SseVector::column_butterfly2([values[13], values[16]]);
let x13m16 = SseVector::apply_rotate90(rotate, x13m16);
let y00 = SseVector::add(y00, x13p16);
let [x14p15, x14m15] = SseVector::column_butterfly2([values[14], values[15]]);
let x14m15 = SseVector::apply_rotate90(rotate, x14m15);
let y00 = SseVector::add(y00, x14p15);
let m0128a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p28);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[1], x2p27);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[2], x3p26);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[3], x4p25);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[4], x5p24);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[5], x6p23);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[6], x7p22);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[7], x8p21);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[8], x9p20);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[9], x10p19);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[10], x11p18);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[11], x12p17);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[12], x13p16);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[13], x14p15);
let m0128b = SseVector::mul(self.twiddles_im[0], x1m28);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[1], x2m27);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[2], x3m26);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[3], x4m25);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[4], x5m24);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[5], x6m23);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[6], x7m22);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[7], x8m21);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[8], x9m20);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[9], x10m19);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[10], x11m18);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[11], x12m17);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[12], x13m16);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[13], x14m15);
let [y01, y28] = SseVector::column_butterfly2([m0128a, m0128b]);
let m0227a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p28);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[3], x2p27);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[5], x3p26);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[7], x4p25);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[9], x5p24);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[11], x6p23);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[13], x7p22);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[12], x8p21);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[10], x9p20);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[8], x10p19);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[6], x11p18);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[4], x12p17);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[2], x13p16);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[0], x14p15);
let m0227b = SseVector::mul(self.twiddles_im[1], x1m28);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[3], x2m27);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[5], x3m26);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[7], x4m25);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[9], x5m24);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[11], x6m23);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[13], x7m22);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[12], x8m21);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[10], x9m20);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[8], x10m19);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[6], x11m18);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[4], x12m17);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[2], x13m16);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[0], x14m15);
let [y02, y27] = SseVector::column_butterfly2([m0227a, m0227b]);
let m0326a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p28);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[5], x2p27);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[8], x3p26);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[11], x4p25);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[13], x5p24);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[10], x6p23);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[7], x7p22);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[4], x8p21);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[1], x9p20);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[0], x10p19);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[3], x11p18);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[6], x12p17);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[9], x13p16);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[12], x14p15);
let m0326b = SseVector::mul(self.twiddles_im[2], x1m28);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[5], x2m27);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[8], x3m26);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[11], x4m25);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[13], x5m24);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[10], x6m23);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[7], x7m22);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[4], x8m21);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[1], x9m20);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[0], x10m19);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[3], x11m18);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[6], x12m17);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[9], x13m16);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[12], x14m15);
let [y03, y26] = SseVector::column_butterfly2([m0326a, m0326b]);
let m0425a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p28);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[7], x2p27);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[11], x3p26);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[12], x4p25);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[8], x5p24);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[4], x6p23);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[0], x7p22);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[2], x8p21);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[6], x9p20);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[10], x10p19);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[13], x11p18);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[9], x12p17);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[5], x13p16);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[1], x14p15);
let m0425b = SseVector::mul(self.twiddles_im[3], x1m28);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[7], x2m27);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[11], x3m26);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[12], x4m25);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[8], x5m24);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[4], x6m23);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[0], x7m22);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[2], x8m21);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[6], x9m20);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[10], x10m19);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[13], x11m18);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[9], x12m17);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[5], x13m16);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[1], x14m15);
let [y04, y25] = SseVector::column_butterfly2([m0425a, m0425b]);
let m0524a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p28);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[9], x2p27);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[13], x3p26);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[8], x4p25);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[3], x5p24);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[0], x6p23);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[5], x7p22);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[10], x8p21);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[12], x9p20);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[7], x10p19);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[2], x11p18);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[1], x12p17);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[6], x13p16);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[11], x14p15);
let m0524b = SseVector::mul(self.twiddles_im[4], x1m28);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[9], x2m27);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[13], x3m26);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[8], x4m25);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[3], x5m24);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[0], x6m23);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[5], x7m22);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[10], x8m21);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[12], x9m20);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[7], x10m19);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[2], x11m18);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[1], x12m17);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[6], x13m16);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[11], x14m15);
let [y05, y24] = SseVector::column_butterfly2([m0524a, m0524b]);
let m0623a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p28);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[11], x2p27);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[10], x3p26);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[4], x4p25);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[0], x5p24);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[6], x6p23);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[12], x7p22);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[9], x8p21);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[3], x9p20);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[1], x10p19);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[7], x11p18);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[13], x12p17);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[8], x13p16);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[2], x14p15);
let m0623b = SseVector::mul(self.twiddles_im[5], x1m28);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[11], x2m27);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[10], x3m26);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[4], x4m25);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[0], x5m24);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[6], x6m23);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[12], x7m22);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[9], x8m21);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[3], x9m20);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[1], x10m19);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[7], x11m18);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[13], x12m17);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[8], x13m16);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[2], x14m15);
let [y06, y23] = SseVector::column_butterfly2([m0623a, m0623b]);
let m0722a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p28);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[13], x2p27);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[7], x3p26);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[0], x4p25);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[5], x5p24);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[12], x6p23);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[8], x7p22);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[1], x8p21);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[4], x9p20);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[11], x10p19);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[9], x11p18);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[2], x12p17);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[3], x13p16);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[10], x14p15);
let m0722b = SseVector::mul(self.twiddles_im[6], x1m28);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[13], x2m27);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[7], x3m26);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[0], x4m25);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[5], x5m24);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[12], x6m23);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[8], x7m22);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[1], x8m21);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[4], x9m20);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[11], x10m19);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[9], x11m18);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[2], x12m17);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[3], x13m16);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[10], x14m15);
let [y07, y22] = SseVector::column_butterfly2([m0722a, m0722b]);
let m0821a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p28);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[12], x2p27);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[4], x3p26);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[2], x4p25);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[10], x5p24);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[9], x6p23);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[1], x7p22);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[5], x8p21);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[13], x9p20);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[6], x10p19);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[0], x11p18);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[8], x12p17);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[11], x13p16);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[3], x14p15);
let m0821b = SseVector::mul(self.twiddles_im[7], x1m28);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[12], x2m27);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[4], x3m26);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[2], x4m25);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[10], x5m24);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[9], x6m23);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[1], x7m22);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[5], x8m21);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[13], x9m20);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[6], x10m19);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[0], x11m18);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[8], x12m17);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[11], x13m16);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[3], x14m15);
let [y08, y21] = SseVector::column_butterfly2([m0821a, m0821b]);
let m0920a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p28);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[10], x2p27);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[1], x3p26);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[6], x4p25);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[12], x5p24);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[3], x6p23);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[4], x7p22);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[13], x8p21);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[5], x9p20);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[2], x10p19);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[11], x11p18);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[7], x12p17);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[0], x13p16);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[9], x14p15);
let m0920b = SseVector::mul(self.twiddles_im[8], x1m28);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[10], x2m27);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[1], x3m26);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[6], x4m25);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[12], x5m24);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[3], x6m23);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[4], x7m22);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[13], x8m21);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[5], x9m20);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[2], x10m19);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[11], x11m18);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[7], x12m17);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[0], x13m16);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[9], x14m15);
let [y09, y20] = SseVector::column_butterfly2([m0920a, m0920b]);
let m1019a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p28);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[8], x2p27);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[0], x3p26);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[10], x4p25);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[7], x5p24);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[1], x6p23);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[11], x7p22);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[6], x8p21);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[2], x9p20);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[12], x10p19);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[5], x11p18);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[3], x12p17);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[13], x13p16);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[4], x14p15);
let m1019b = SseVector::mul(self.twiddles_im[9], x1m28);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[8], x2m27);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[0], x3m26);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[10], x4m25);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[7], x5m24);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[1], x6m23);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[11], x7m22);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[6], x8m21);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[2], x9m20);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[12], x10m19);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[5], x11m18);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[3], x12m17);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[13], x13m16);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[4], x14m15);
let [y10, y19] = SseVector::column_butterfly2([m1019a, m1019b]);
let m1118a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p28);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[6], x2p27);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[3], x3p26);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[13], x4p25);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[2], x5p24);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[7], x6p23);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[9], x7p22);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[0], x8p21);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[11], x9p20);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[5], x10p19);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[4], x11p18);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[12], x12p17);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[1], x13p16);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[8], x14p15);
let m1118b = SseVector::mul(self.twiddles_im[10], x1m28);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[6], x2m27);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[3], x3m26);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[13], x4m25);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[2], x5m24);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[7], x6m23);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[9], x7m22);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[0], x8m21);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[11], x9m20);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[5], x10m19);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[4], x11m18);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[12], x12m17);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[1], x13m16);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[8], x14m15);
let [y11, y18] = SseVector::column_butterfly2([m1118a, m1118b]);
let m1217a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p28);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[4], x2p27);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[6], x3p26);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[9], x4p25);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[1], x5p24);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[13], x6p23);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[2], x7p22);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[8], x8p21);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[7], x9p20);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[3], x10p19);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[12], x11p18);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[0], x12p17);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[10], x13p16);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[5], x14p15);
let m1217b = SseVector::mul(self.twiddles_im[11], x1m28);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[4], x2m27);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[6], x3m26);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[9], x4m25);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[1], x5m24);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[13], x6m23);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[2], x7m22);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[8], x8m21);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[7], x9m20);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[3], x10m19);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[12], x11m18);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[0], x12m17);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[10], x13m16);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[5], x14m15);
let [y12, y17] = SseVector::column_butterfly2([m1217a, m1217b]);
let m1316a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p28);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[2], x2p27);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[9], x3p26);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[5], x4p25);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[6], x5p24);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[8], x6p23);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[3], x7p22);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[11], x8p21);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[0], x9p20);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[13], x10p19);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[1], x11p18);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[10], x12p17);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[4], x13p16);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[7], x14p15);
let m1316b = SseVector::mul(self.twiddles_im[12], x1m28);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[2], x2m27);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[9], x3m26);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[5], x4m25);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[6], x5m24);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[8], x6m23);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[3], x7m22);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[11], x8m21);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[0], x9m20);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[13], x10m19);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[1], x11m18);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[10], x12m17);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[4], x13m16);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[7], x14m15);
let [y13, y16] = SseVector::column_butterfly2([m1316a, m1316b]);
let m1415a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p28);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[0], x2p27);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[12], x3p26);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[1], x4p25);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[11], x5p24);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[2], x6p23);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[10], x7p22);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[3], x8p21);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[9], x9p20);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[4], x10p19);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[8], x11p18);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[5], x12p17);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[7], x13p16);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[6], x14p15);
let m1415b = SseVector::mul(self.twiddles_im[13], x1m28);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[0], x2m27);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[12], x3m26);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[1], x4m25);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[11], x5m24);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[2], x6m23);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[10], x7m22);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[3], x8m21);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[9], x9m20);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[4], x10m19);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[8], x11m18);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[5], x12m17);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[7], x13m16);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[6], x14m15);
let [y14, y15] = SseVector::column_butterfly2([m1415a, m1415b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
}
}
/// Length-29 FFT butterfly operating on `f64` data held in SSE `__m128d` vectors.
///
/// `T` is only carried through `PhantomData`; the constructor calls `assert_f64::<T>()`
/// before building the twiddle tables.
struct SseF64Butterfly29<T> {
/// Transform direction passed to the constructor.
direction: FftDirection,
/// Real parts of the 14 twiddle factors, each broadcast into a vector by the constructor.
twiddles_re: [__m128d; 14],
/// Imaginary parts of the same 14 twiddle factors, each broadcast into a vector.
twiddles_im: [__m128d; 14],
/// Ties the struct to `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
}
// Passes the type name, the length 29 and a closure returning the stored direction to the
// f64 SSE butterfly boilerplate macro. The macro is defined outside this section --
// presumably it generates the `Fft` / `Length` / `Direction` trait impls; confirm there.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly29, 29, |this: &SseF64Butterfly29<_>| this.direction);
impl<T: FftNum> SseF64Butterfly29<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let twiddles = make_twiddles(29, direction);
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs one 29-point FFT in place on `buffer`.
///
/// Reads indices 0 through 28 from `buffer`, passes the resulting array through
/// `perform_fft_direct`, and writes the 29 results back to the same indices.
///
/// # Safety
/// NOTE(review): presumably `buffer` must expose at least 29 complex `f64` elements and the
/// CPU must support SSE4.1 -- the load/store macros are defined elsewhere; confirm there.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
// Collect inputs 0..=28 into a single array for the direct kernel.
let values = read_complex_to_array!(buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
let out = self.perform_fft_direct(values);
// Store the results back at the same indices they were read from.
write_complex_to_array!(out, buffer, { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28 });
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 29]) -> [__m128d; 29] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
let y00 = values[0];
let [x1p28, x1m28] = SseVector::column_butterfly2([values[1], values[28]]);
let x1m28 = SseVector::apply_rotate90(rotate, x1m28);
let y00 = SseVector::add(y00, x1p28);
let [x2p27, x2m27] = SseVector::column_butterfly2([values[2], values[27]]);
let x2m27 = SseVector::apply_rotate90(rotate, x2m27);
let y00 = SseVector::add(y00, x2p27);
let [x3p26, x3m26] = SseVector::column_butterfly2([values[3], values[26]]);
let x3m26 = SseVector::apply_rotate90(rotate, x3m26);
let y00 = SseVector::add(y00, x3p26);
let [x4p25, x4m25] = SseVector::column_butterfly2([values[4], values[25]]);
let x4m25 = SseVector::apply_rotate90(rotate, x4m25);
let y00 = SseVector::add(y00, x4p25);
let [x5p24, x5m24] = SseVector::column_butterfly2([values[5], values[24]]);
let x5m24 = SseVector::apply_rotate90(rotate, x5m24);
let y00 = SseVector::add(y00, x5p24);
let [x6p23, x6m23] = SseVector::column_butterfly2([values[6], values[23]]);
let x6m23 = SseVector::apply_rotate90(rotate, x6m23);
let y00 = SseVector::add(y00, x6p23);
let [x7p22, x7m22] = SseVector::column_butterfly2([values[7], values[22]]);
let x7m22 = SseVector::apply_rotate90(rotate, x7m22);
let y00 = SseVector::add(y00, x7p22);
let [x8p21, x8m21] = SseVector::column_butterfly2([values[8], values[21]]);
let x8m21 = SseVector::apply_rotate90(rotate, x8m21);
let y00 = SseVector::add(y00, x8p21);
let [x9p20, x9m20] = SseVector::column_butterfly2([values[9], values[20]]);
let x9m20 = SseVector::apply_rotate90(rotate, x9m20);
let y00 = SseVector::add(y00, x9p20);
let [x10p19, x10m19] = SseVector::column_butterfly2([values[10], values[19]]);
let x10m19 = SseVector::apply_rotate90(rotate, x10m19);
let y00 = SseVector::add(y00, x10p19);
let [x11p18, x11m18] = SseVector::column_butterfly2([values[11], values[18]]);
let x11m18 = SseVector::apply_rotate90(rotate, x11m18);
let y00 = SseVector::add(y00, x11p18);
let [x12p17, x12m17] = SseVector::column_butterfly2([values[12], values[17]]);
let x12m17 = SseVector::apply_rotate90(rotate, x12m17);
let y00 = SseVector::add(y00, x12p17);
let [x13p16, x13m16] = SseVector::column_butterfly2([values[13], values[16]]);
let x13m16 = SseVector::apply_rotate90(rotate, x13m16);
let y00 = SseVector::add(y00, x13p16);
let [x14p15, x14m15] = SseVector::column_butterfly2([values[14], values[15]]);
let x14m15 = SseVector::apply_rotate90(rotate, x14m15);
let y00 = SseVector::add(y00, x14p15);
let m0128a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p28);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[1], x2p27);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[2], x3p26);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[3], x4p25);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[4], x5p24);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[5], x6p23);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[6], x7p22);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[7], x8p21);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[8], x9p20);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[9], x10p19);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[10], x11p18);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[11], x12p17);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[12], x13p16);
let m0128a = SseVector::fmadd(m0128a, self.twiddles_re[13], x14p15);
let m0128b = SseVector::mul(self.twiddles_im[0], x1m28);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[1], x2m27);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[2], x3m26);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[3], x4m25);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[4], x5m24);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[5], x6m23);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[6], x7m22);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[7], x8m21);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[8], x9m20);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[9], x10m19);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[10], x11m18);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[11], x12m17);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[12], x13m16);
let m0128b = SseVector::fmadd(m0128b, self.twiddles_im[13], x14m15);
let [y01, y28] = SseVector::column_butterfly2([m0128a, m0128b]);
let m0227a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p28);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[3], x2p27);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[5], x3p26);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[7], x4p25);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[9], x5p24);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[11], x6p23);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[13], x7p22);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[12], x8p21);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[10], x9p20);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[8], x10p19);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[6], x11p18);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[4], x12p17);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[2], x13p16);
let m0227a = SseVector::fmadd(m0227a, self.twiddles_re[0], x14p15);
let m0227b = SseVector::mul(self.twiddles_im[1], x1m28);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[3], x2m27);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[5], x3m26);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[7], x4m25);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[9], x5m24);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[11], x6m23);
let m0227b = SseVector::fmadd(m0227b, self.twiddles_im[13], x7m22);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[12], x8m21);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[10], x9m20);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[8], x10m19);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[6], x11m18);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[4], x12m17);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[2], x13m16);
let m0227b = SseVector::nmadd(m0227b, self.twiddles_im[0], x14m15);
let [y02, y27] = SseVector::column_butterfly2([m0227a, m0227b]);
let m0326a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p28);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[5], x2p27);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[8], x3p26);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[11], x4p25);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[13], x5p24);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[10], x6p23);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[7], x7p22);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[4], x8p21);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[1], x9p20);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[0], x10p19);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[3], x11p18);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[6], x12p17);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[9], x13p16);
let m0326a = SseVector::fmadd(m0326a, self.twiddles_re[12], x14p15);
let m0326b = SseVector::mul(self.twiddles_im[2], x1m28);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[5], x2m27);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[8], x3m26);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[11], x4m25);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[13], x5m24);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[10], x6m23);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[7], x7m22);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[4], x8m21);
let m0326b = SseVector::nmadd(m0326b, self.twiddles_im[1], x9m20);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[0], x10m19);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[3], x11m18);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[6], x12m17);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[9], x13m16);
let m0326b = SseVector::fmadd(m0326b, self.twiddles_im[12], x14m15);
let [y03, y26] = SseVector::column_butterfly2([m0326a, m0326b]);
let m0425a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p28);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[7], x2p27);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[11], x3p26);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[12], x4p25);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[8], x5p24);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[4], x6p23);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[0], x7p22);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[2], x8p21);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[6], x9p20);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[10], x10p19);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[13], x11p18);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[9], x12p17);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[5], x13p16);
let m0425a = SseVector::fmadd(m0425a, self.twiddles_re[1], x14p15);
let m0425b = SseVector::mul(self.twiddles_im[3], x1m28);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[7], x2m27);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[11], x3m26);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[12], x4m25);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[8], x5m24);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[4], x6m23);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[0], x7m22);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[2], x8m21);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[6], x9m20);
let m0425b = SseVector::fmadd(m0425b, self.twiddles_im[10], x10m19);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[13], x11m18);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[9], x12m17);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[5], x13m16);
let m0425b = SseVector::nmadd(m0425b, self.twiddles_im[1], x14m15);
let [y04, y25] = SseVector::column_butterfly2([m0425a, m0425b]);
let m0524a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p28);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[9], x2p27);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[13], x3p26);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[8], x4p25);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[3], x5p24);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[0], x6p23);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[5], x7p22);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[10], x8p21);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[12], x9p20);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[7], x10p19);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[2], x11p18);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[1], x12p17);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[6], x13p16);
let m0524a = SseVector::fmadd(m0524a, self.twiddles_re[11], x14p15);
let m0524b = SseVector::mul(self.twiddles_im[4], x1m28);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[9], x2m27);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[13], x3m26);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[8], x4m25);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[3], x5m24);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[0], x6m23);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[5], x7m22);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[10], x8m21);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[12], x9m20);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[7], x10m19);
let m0524b = SseVector::nmadd(m0524b, self.twiddles_im[2], x11m18);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[1], x12m17);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[6], x13m16);
let m0524b = SseVector::fmadd(m0524b, self.twiddles_im[11], x14m15);
let [y05, y24] = SseVector::column_butterfly2([m0524a, m0524b]);
let m0623a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p28);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[11], x2p27);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[10], x3p26);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[4], x4p25);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[0], x5p24);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[6], x6p23);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[12], x7p22);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[9], x8p21);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[3], x9p20);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[1], x10p19);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[7], x11p18);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[13], x12p17);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[8], x13p16);
let m0623a = SseVector::fmadd(m0623a, self.twiddles_re[2], x14p15);
let m0623b = SseVector::mul(self.twiddles_im[5], x1m28);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[11], x2m27);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[10], x3m26);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[4], x4m25);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[0], x5m24);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[6], x6m23);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[12], x7m22);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[9], x8m21);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[3], x9m20);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[1], x10m19);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[7], x11m18);
let m0623b = SseVector::fmadd(m0623b, self.twiddles_im[13], x12m17);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[8], x13m16);
let m0623b = SseVector::nmadd(m0623b, self.twiddles_im[2], x14m15);
let [y06, y23] = SseVector::column_butterfly2([m0623a, m0623b]);
let m0722a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p28);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[13], x2p27);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[7], x3p26);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[0], x4p25);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[5], x5p24);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[12], x6p23);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[8], x7p22);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[1], x8p21);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[4], x9p20);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[11], x10p19);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[9], x11p18);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[2], x12p17);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[3], x13p16);
let m0722a = SseVector::fmadd(m0722a, self.twiddles_re[10], x14p15);
let m0722b = SseVector::mul(self.twiddles_im[6], x1m28);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[13], x2m27);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[7], x3m26);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[0], x4m25);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[5], x5m24);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[12], x6m23);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[8], x7m22);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[1], x8m21);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[4], x9m20);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[11], x10m19);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[9], x11m18);
let m0722b = SseVector::nmadd(m0722b, self.twiddles_im[2], x12m17);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[3], x13m16);
let m0722b = SseVector::fmadd(m0722b, self.twiddles_im[10], x14m15);
let [y07, y22] = SseVector::column_butterfly2([m0722a, m0722b]);
let m0821a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p28);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[12], x2p27);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[4], x3p26);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[2], x4p25);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[10], x5p24);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[9], x6p23);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[1], x7p22);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[5], x8p21);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[13], x9p20);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[6], x10p19);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[0], x11p18);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[8], x12p17);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[11], x13p16);
let m0821a = SseVector::fmadd(m0821a, self.twiddles_re[3], x14p15);
let m0821b = SseVector::mul(self.twiddles_im[7], x1m28);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[12], x2m27);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[4], x3m26);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[2], x4m25);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[10], x5m24);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[9], x6m23);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[1], x7m22);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[5], x8m21);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[13], x9m20);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[6], x10m19);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[0], x11m18);
let m0821b = SseVector::fmadd(m0821b, self.twiddles_im[8], x12m17);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[11], x13m16);
let m0821b = SseVector::nmadd(m0821b, self.twiddles_im[3], x14m15);
let [y08, y21] = SseVector::column_butterfly2([m0821a, m0821b]);
let m0920a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p28);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[10], x2p27);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[1], x3p26);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[6], x4p25);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[12], x5p24);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[3], x6p23);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[4], x7p22);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[13], x8p21);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[5], x9p20);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[2], x10p19);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[11], x11p18);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[7], x12p17);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[0], x13p16);
let m0920a = SseVector::fmadd(m0920a, self.twiddles_re[9], x14p15);
let m0920b = SseVector::mul(self.twiddles_im[8], x1m28);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[10], x2m27);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[1], x3m26);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[6], x4m25);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[12], x5m24);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[3], x6m23);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[4], x7m22);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[13], x8m21);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[5], x9m20);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[2], x10m19);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[11], x11m18);
let m0920b = SseVector::nmadd(m0920b, self.twiddles_im[7], x12m17);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[0], x13m16);
let m0920b = SseVector::fmadd(m0920b, self.twiddles_im[9], x14m15);
let [y09, y20] = SseVector::column_butterfly2([m0920a, m0920b]);
let m1019a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p28);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[8], x2p27);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[0], x3p26);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[10], x4p25);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[7], x5p24);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[1], x6p23);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[11], x7p22);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[6], x8p21);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[2], x9p20);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[12], x10p19);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[5], x11p18);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[3], x12p17);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[13], x13p16);
let m1019a = SseVector::fmadd(m1019a, self.twiddles_re[4], x14p15);
let m1019b = SseVector::mul(self.twiddles_im[9], x1m28);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[8], x2m27);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[0], x3m26);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[10], x4m25);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[7], x5m24);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[1], x6m23);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[11], x7m22);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[6], x8m21);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[2], x9m20);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[12], x10m19);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[5], x11m18);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[3], x12m17);
let m1019b = SseVector::fmadd(m1019b, self.twiddles_im[13], x13m16);
let m1019b = SseVector::nmadd(m1019b, self.twiddles_im[4], x14m15);
let [y10, y19] = SseVector::column_butterfly2([m1019a, m1019b]);
let m1118a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p28);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[6], x2p27);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[3], x3p26);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[13], x4p25);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[2], x5p24);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[7], x6p23);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[9], x7p22);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[0], x8p21);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[11], x9p20);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[5], x10p19);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[4], x11p18);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[12], x12p17);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[1], x13p16);
let m1118a = SseVector::fmadd(m1118a, self.twiddles_re[8], x14p15);
let m1118b = SseVector::mul(self.twiddles_im[10], x1m28);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[6], x2m27);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[3], x3m26);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[13], x4m25);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[2], x5m24);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[7], x6m23);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[9], x7m22);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[0], x8m21);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[11], x9m20);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[5], x10m19);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[4], x11m18);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[12], x12m17);
let m1118b = SseVector::nmadd(m1118b, self.twiddles_im[1], x13m16);
let m1118b = SseVector::fmadd(m1118b, self.twiddles_im[8], x14m15);
let [y11, y18] = SseVector::column_butterfly2([m1118a, m1118b]);
let m1217a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p28);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[4], x2p27);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[6], x3p26);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[9], x4p25);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[1], x5p24);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[13], x6p23);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[2], x7p22);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[8], x8p21);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[7], x9p20);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[3], x10p19);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[12], x11p18);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[0], x12p17);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[10], x13p16);
let m1217a = SseVector::fmadd(m1217a, self.twiddles_re[5], x14p15);
let m1217b = SseVector::mul(self.twiddles_im[11], x1m28);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[4], x2m27);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[6], x3m26);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[9], x4m25);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[1], x5m24);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[13], x6m23);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[2], x7m22);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[8], x8m21);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[7], x9m20);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[3], x10m19);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[12], x11m18);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[0], x12m17);
let m1217b = SseVector::fmadd(m1217b, self.twiddles_im[10], x13m16);
let m1217b = SseVector::nmadd(m1217b, self.twiddles_im[5], x14m15);
let [y12, y17] = SseVector::column_butterfly2([m1217a, m1217b]);
let m1316a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p28);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[2], x2p27);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[9], x3p26);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[5], x4p25);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[6], x5p24);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[8], x6p23);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[3], x7p22);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[11], x8p21);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[0], x9p20);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[13], x10p19);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[1], x11p18);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[10], x12p17);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[4], x13p16);
let m1316a = SseVector::fmadd(m1316a, self.twiddles_re[7], x14p15);
let m1316b = SseVector::mul(self.twiddles_im[12], x1m28);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[2], x2m27);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[9], x3m26);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[5], x4m25);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[6], x5m24);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[8], x6m23);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[3], x7m22);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[11], x8m21);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[0], x9m20);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[13], x10m19);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[1], x11m18);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[10], x12m17);
let m1316b = SseVector::nmadd(m1316b, self.twiddles_im[4], x13m16);
let m1316b = SseVector::fmadd(m1316b, self.twiddles_im[7], x14m15);
let [y13, y16] = SseVector::column_butterfly2([m1316a, m1316b]);
let m1415a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p28);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[0], x2p27);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[12], x3p26);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[1], x4p25);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[11], x5p24);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[2], x6p23);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[10], x7p22);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[3], x8p21);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[9], x9p20);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[4], x10p19);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[8], x11p18);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[5], x12p17);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[7], x13p16);
let m1415a = SseVector::fmadd(m1415a, self.twiddles_re[6], x14p15);
let m1415b = SseVector::mul(self.twiddles_im[13], x1m28);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[0], x2m27);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[12], x3m26);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[1], x4m25);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[11], x5m24);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[2], x6m23);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[10], x7m22);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[3], x8m21);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[9], x9m20);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[4], x10m19);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[8], x11m18);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[5], x12m17);
let m1415b = SseVector::fmadd(m1415b, self.twiddles_im[7], x13m16);
let m1415b = SseVector::nmadd(m1415b, self.twiddles_im[6], x14m15);
let [y14, y15] = SseVector::column_butterfly2([m1415a, m1415b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28]
}
}
/// SSE butterfly for FFTs of length 31 operating on `f32` data.
///
/// `T` is the numeric type parameter required by the surrounding FFT API; `new` calls
/// `assert_f32::<T>()` before building the instance (presumably enforcing `T == f32` — confirm
/// in `sse_common`).
struct SseF32Butterfly31<T> {
// Direction this instance was constructed for; also passed to `make_twiddles` in `new`.
direction: FftDirection,
// Real parts of the 15 twiddle factors from `make_twiddles(31, direction)`,
// each expanded with `SseVector::broadcast_scalar`.
twiddles_re: [__m128; 15],
// Imaginary parts of the same 15 twiddle factors, expanded the same way.
twiddles_im: [__m128; 15],
// Zero-sized marker that ties the otherwise unused type parameter `T` to the struct.
_phantom: std::marker::PhantomData<T>,
}
// Invokes the shared f32 butterfly boilerplate macro for `SseF32Butterfly31` with length 31 and
// a closure that returns the instance's `direction`.
// NOTE(review): the macro is defined elsewhere; presumably it generates the public FFT trait
// impls that forward to the `perform_*` methods below — confirm against its definition.
boilerplate_fft_sse_f32_butterfly!(SseF32Butterfly31, 31, |this: &SseF32Butterfly31<_>| this.direction);
impl<T: FftNum> SseF32Butterfly31<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f32::<T>();
let twiddles = make_twiddles(31, direction);
Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}
}
/// Runs one length-31 transform over `buffer`, writing the result back to indices 0..=30.
///
/// The 31 inputs are gathered with `read_partial1_complex_to_array!`, pushed through
/// `perform_parallel_fft_direct`, and stored with `write_partial_lo_complex_to_array!`.
/// NOTE(review): presumably each loaded vector carries a single complex value and only the
/// low half of each result vector is stored — confirm against the macro definitions.
///
/// # Safety
///
/// NOTE(review): presumably `buffer` must hold at least 31 complex elements and the CPU must
/// support SSE4.1 — confirm against the `SseArrayMut` contract.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let input = read_partial1_complex_to_array!(buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    });

    let spectrum = self.perform_parallel_fft_direct(input);

    write_partial_lo_complex_to_array!(spectrum, buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    });
}
/// Transforms the contents of `buffer` through one call to `perform_parallel_fft_direct`.
///
/// Steps:
/// 1. Load 31 vectors from `buffer` at offsets 0, 2, 4, ..., 60.
/// 2. Shuffle them with the `extract_*_f32` helpers into the 31 vectors handed to the kernel.
/// 3. Shuffle the 31 kernel outputs back and store them with a stride of 2.
///
/// NOTE(review): the shuffle pattern is consistent with `buffer` holding two back-to-back
/// length-31 signals (62 complex values), each `__m128` holding two complex `f32` values, and
/// `extract_X_Y_f32(a, b)` combining the X half of `a` with the Y half of `b` — confirm
/// against `sse_utils`.
///
/// # Safety
///
/// NOTE(review): presumably `buffer` must hold at least 62 complex elements and the CPU must
/// support SSE4.1 — confirm against the `SseArrayMut` contract.
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_contiguous(&self, mut buffer: impl SseArrayMut<f32>) {
    let packed_in = read_complex_to_array!(buffer, {
        0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
        32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60
    });

    // Entry k of `lanes` pairs one half of `packed_in[k / 2]` with one half of a vector
    // from the upper part of `packed_in` (indices 15..=30).
    let lanes = [
        extract_lo_hi_f32(packed_in[0], packed_in[15]),
        extract_hi_lo_f32(packed_in[0], packed_in[16]),
        extract_lo_hi_f32(packed_in[1], packed_in[16]),
        extract_hi_lo_f32(packed_in[1], packed_in[17]),
        extract_lo_hi_f32(packed_in[2], packed_in[17]),
        extract_hi_lo_f32(packed_in[2], packed_in[18]),
        extract_lo_hi_f32(packed_in[3], packed_in[18]),
        extract_hi_lo_f32(packed_in[3], packed_in[19]),
        extract_lo_hi_f32(packed_in[4], packed_in[19]),
        extract_hi_lo_f32(packed_in[4], packed_in[20]),
        extract_lo_hi_f32(packed_in[5], packed_in[20]),
        extract_hi_lo_f32(packed_in[5], packed_in[21]),
        extract_lo_hi_f32(packed_in[6], packed_in[21]),
        extract_hi_lo_f32(packed_in[6], packed_in[22]),
        extract_lo_hi_f32(packed_in[7], packed_in[22]),
        extract_hi_lo_f32(packed_in[7], packed_in[23]),
        extract_lo_hi_f32(packed_in[8], packed_in[23]),
        extract_hi_lo_f32(packed_in[8], packed_in[24]),
        extract_lo_hi_f32(packed_in[9], packed_in[24]),
        extract_hi_lo_f32(packed_in[9], packed_in[25]),
        extract_lo_hi_f32(packed_in[10], packed_in[25]),
        extract_hi_lo_f32(packed_in[10], packed_in[26]),
        extract_lo_hi_f32(packed_in[11], packed_in[26]),
        extract_hi_lo_f32(packed_in[11], packed_in[27]),
        extract_lo_hi_f32(packed_in[12], packed_in[27]),
        extract_hi_lo_f32(packed_in[12], packed_in[28]),
        extract_lo_hi_f32(packed_in[13], packed_in[28]),
        extract_hi_lo_f32(packed_in[13], packed_in[29]),
        extract_lo_hi_f32(packed_in[14], packed_in[29]),
        extract_hi_lo_f32(packed_in[14], packed_in[30]),
        extract_lo_hi_f32(packed_in[15], packed_in[30]),
    ];

    let results = self.perform_parallel_fft_direct(lanes);

    let packed_out = [
        // Low halves of consecutive result pairs.
        extract_lo_lo_f32(results[0], results[1]),
        extract_lo_lo_f32(results[2], results[3]),
        extract_lo_lo_f32(results[4], results[5]),
        extract_lo_lo_f32(results[6], results[7]),
        extract_lo_lo_f32(results[8], results[9]),
        extract_lo_lo_f32(results[10], results[11]),
        extract_lo_lo_f32(results[12], results[13]),
        extract_lo_lo_f32(results[14], results[15]),
        extract_lo_lo_f32(results[16], results[17]),
        extract_lo_lo_f32(results[18], results[19]),
        extract_lo_lo_f32(results[20], results[21]),
        extract_lo_lo_f32(results[22], results[23]),
        extract_lo_lo_f32(results[24], results[25]),
        extract_lo_lo_f32(results[26], results[27]),
        extract_lo_lo_f32(results[28], results[29]),
        // Seam: low half of the last result followed by the high half of the first.
        extract_lo_hi_f32(results[30], results[0]),
        // High halves of consecutive result pairs.
        extract_hi_hi_f32(results[1], results[2]),
        extract_hi_hi_f32(results[3], results[4]),
        extract_hi_hi_f32(results[5], results[6]),
        extract_hi_hi_f32(results[7], results[8]),
        extract_hi_hi_f32(results[9], results[10]),
        extract_hi_hi_f32(results[11], results[12]),
        extract_hi_hi_f32(results[13], results[14]),
        extract_hi_hi_f32(results[15], results[16]),
        extract_hi_hi_f32(results[17], results[18]),
        extract_hi_hi_f32(results[19], results[20]),
        extract_hi_hi_f32(results[21], results[22]),
        extract_hi_hi_f32(results[23], results[24]),
        extract_hi_hi_f32(results[25], results[26]),
        extract_hi_hi_f32(results[27], results[28]),
        extract_hi_hi_f32(results[29], results[30]),
    ];

    write_complex_to_array_strided!(packed_out, buffer, 2, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    });
}
#[inline(always)]
pub(crate) unsafe fn perform_parallel_fft_direct(&self, values: [__m128; 31]) -> [__m128; 31] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
let y00 = values[0];
let [x1p30, x1m30] = SseVector::column_butterfly2([values[1], values[30]]);
let x1m30 = SseVector::apply_rotate90(rotate, x1m30);
let y00 = SseVector::add(y00, x1p30);
let [x2p29, x2m29] = SseVector::column_butterfly2([values[2], values[29]]);
let x2m29 = SseVector::apply_rotate90(rotate, x2m29);
let y00 = SseVector::add(y00, x2p29);
let [x3p28, x3m28] = SseVector::column_butterfly2([values[3], values[28]]);
let x3m28 = SseVector::apply_rotate90(rotate, x3m28);
let y00 = SseVector::add(y00, x3p28);
let [x4p27, x4m27] = SseVector::column_butterfly2([values[4], values[27]]);
let x4m27 = SseVector::apply_rotate90(rotate, x4m27);
let y00 = SseVector::add(y00, x4p27);
let [x5p26, x5m26] = SseVector::column_butterfly2([values[5], values[26]]);
let x5m26 = SseVector::apply_rotate90(rotate, x5m26);
let y00 = SseVector::add(y00, x5p26);
let [x6p25, x6m25] = SseVector::column_butterfly2([values[6], values[25]]);
let x6m25 = SseVector::apply_rotate90(rotate, x6m25);
let y00 = SseVector::add(y00, x6p25);
let [x7p24, x7m24] = SseVector::column_butterfly2([values[7], values[24]]);
let x7m24 = SseVector::apply_rotate90(rotate, x7m24);
let y00 = SseVector::add(y00, x7p24);
let [x8p23, x8m23] = SseVector::column_butterfly2([values[8], values[23]]);
let x8m23 = SseVector::apply_rotate90(rotate, x8m23);
let y00 = SseVector::add(y00, x8p23);
let [x9p22, x9m22] = SseVector::column_butterfly2([values[9], values[22]]);
let x9m22 = SseVector::apply_rotate90(rotate, x9m22);
let y00 = SseVector::add(y00, x9p22);
let [x10p21, x10m21] = SseVector::column_butterfly2([values[10], values[21]]);
let x10m21 = SseVector::apply_rotate90(rotate, x10m21);
let y00 = SseVector::add(y00, x10p21);
let [x11p20, x11m20] = SseVector::column_butterfly2([values[11], values[20]]);
let x11m20 = SseVector::apply_rotate90(rotate, x11m20);
let y00 = SseVector::add(y00, x11p20);
let [x12p19, x12m19] = SseVector::column_butterfly2([values[12], values[19]]);
let x12m19 = SseVector::apply_rotate90(rotate, x12m19);
let y00 = SseVector::add(y00, x12p19);
let [x13p18, x13m18] = SseVector::column_butterfly2([values[13], values[18]]);
let x13m18 = SseVector::apply_rotate90(rotate, x13m18);
let y00 = SseVector::add(y00, x13p18);
let [x14p17, x14m17] = SseVector::column_butterfly2([values[14], values[17]]);
let x14m17 = SseVector::apply_rotate90(rotate, x14m17);
let y00 = SseVector::add(y00, x14p17);
let [x15p16, x15m16] = SseVector::column_butterfly2([values[15], values[16]]);
let x15m16 = SseVector::apply_rotate90(rotate, x15m16);
let y00 = SseVector::add(y00, x15p16);
let m0130a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p30);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[1], x2p29);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[2], x3p28);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[3], x4p27);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[4], x5p26);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[5], x6p25);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[6], x7p24);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[7], x8p23);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[8], x9p22);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[9], x10p21);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[10], x11p20);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[11], x12p19);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[12], x13p18);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[13], x14p17);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[14], x15p16);
let m0130b = SseVector::mul(self.twiddles_im[0], x1m30);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[1], x2m29);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[2], x3m28);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[3], x4m27);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[4], x5m26);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[5], x6m25);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[6], x7m24);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[7], x8m23);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[8], x9m22);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[9], x10m21);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[10], x11m20);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[11], x12m19);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[12], x13m18);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[13], x14m17);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[14], x15m16);
let [y01, y30] = SseVector::column_butterfly2([m0130a, m0130b]);
let m0229a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p30);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[3], x2p29);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[5], x3p28);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[7], x4p27);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[9], x5p26);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[11], x6p25);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[13], x7p24);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[14], x8p23);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[12], x9p22);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[10], x10p21);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[8], x11p20);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[6], x12p19);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[4], x13p18);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[2], x14p17);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[0], x15p16);
let m0229b = SseVector::mul(self.twiddles_im[1], x1m30);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[3], x2m29);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[5], x3m28);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[7], x4m27);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[9], x5m26);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[11], x6m25);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[13], x7m24);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[14], x8m23);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[12], x9m22);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[10], x10m21);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[8], x11m20);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[6], x12m19);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[4], x13m18);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[2], x14m17);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[0], x15m16);
let [y02, y29] = SseVector::column_butterfly2([m0229a, m0229b]);
let m0328a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p30);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[5], x2p29);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[8], x3p28);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[11], x4p27);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[14], x5p26);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[12], x6p25);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[9], x7p24);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[6], x8p23);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[3], x9p22);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[0], x10p21);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[1], x11p20);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[4], x12p19);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[7], x13p18);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[10], x14p17);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[13], x15p16);
let m0328b = SseVector::mul(self.twiddles_im[2], x1m30);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[5], x2m29);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[8], x3m28);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[11], x4m27);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[14], x5m26);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[12], x6m25);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[9], x7m24);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[6], x8m23);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[3], x9m22);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[0], x10m21);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[1], x11m20);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[4], x12m19);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[7], x13m18);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[10], x14m17);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[13], x15m16);
let [y03, y28] = SseVector::column_butterfly2([m0328a, m0328b]);
let m0427a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p30);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[7], x2p29);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[11], x3p28);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[14], x4p27);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[10], x5p26);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[6], x6p25);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[2], x7p24);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[0], x8p23);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[4], x9p22);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[8], x10p21);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[12], x11p20);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[13], x12p19);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[9], x13p18);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[5], x14p17);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[1], x15p16);
let m0427b = SseVector::mul(self.twiddles_im[3], x1m30);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[7], x2m29);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[11], x3m28);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[14], x4m27);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[10], x5m26);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[6], x6m25);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[2], x7m24);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[0], x8m23);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[4], x9m22);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[8], x10m21);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[12], x11m20);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[13], x12m19);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[9], x13m18);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[5], x14m17);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[1], x15m16);
let [y04, y27] = SseVector::column_butterfly2([m0427a, m0427b]);
let m0526a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p30);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[9], x2p29);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[14], x3p28);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[10], x4p27);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[5], x5p26);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[0], x6p25);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[3], x7p24);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[8], x8p23);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[13], x9p22);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[11], x10p21);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[6], x11p20);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[1], x12p19);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[2], x13p18);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[7], x14p17);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[12], x15p16);
let m0526b = SseVector::mul(self.twiddles_im[4], x1m30);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[9], x2m29);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[14], x3m28);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[10], x4m27);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[5], x5m26);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[0], x6m25);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[3], x7m24);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[8], x8m23);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[13], x9m22);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[11], x10m21);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[6], x11m20);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[1], x12m19);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[2], x13m18);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[7], x14m17);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[12], x15m16);
let [y05, y26] = SseVector::column_butterfly2([m0526a, m0526b]);
let m0625a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p30);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[11], x2p29);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[12], x3p28);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[6], x4p27);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[0], x5p26);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[4], x6p25);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[10], x7p24);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[13], x8p23);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[7], x9p22);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[1], x10p21);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[3], x11p20);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[9], x12p19);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[14], x13p18);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[8], x14p17);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[2], x15p16);
let m0625b = SseVector::mul(self.twiddles_im[5], x1m30);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[11], x2m29);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[12], x3m28);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[6], x4m27);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[0], x5m26);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[4], x6m25);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[10], x7m24);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[13], x8m23);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[7], x9m22);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[1], x10m21);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[3], x11m20);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[9], x12m19);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[14], x13m18);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[8], x14m17);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[2], x15m16);
let [y06, y25] = SseVector::column_butterfly2([m0625a, m0625b]);
let m0724a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p30);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[13], x2p29);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[9], x3p28);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[2], x4p27);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[3], x5p26);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[10], x6p25);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[12], x7p24);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[5], x8p23);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[0], x9p22);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[7], x10p21);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[14], x11p20);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[8], x12p19);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[1], x13p18);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[4], x14p17);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[11], x15p16);
let m0724b = SseVector::mul(self.twiddles_im[6], x1m30);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[13], x2m29);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[9], x3m28);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[2], x4m27);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[3], x5m26);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[10], x6m25);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[12], x7m24);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[5], x8m23);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[0], x9m22);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[7], x10m21);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[14], x11m20);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[8], x12m19);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[1], x13m18);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[4], x14m17);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[11], x15m16);
let [y07, y24] = SseVector::column_butterfly2([m0724a, m0724b]);
let m0823a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p30);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[14], x2p29);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[6], x3p28);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[0], x4p27);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[8], x5p26);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[13], x6p25);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[5], x7p24);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[1], x8p23);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[9], x9p22);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[12], x10p21);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[4], x11p20);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[2], x12p19);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[10], x13p18);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[11], x14p17);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[3], x15p16);
let m0823b = SseVector::mul(self.twiddles_im[7], x1m30);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[14], x2m29);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[6], x3m28);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[0], x4m27);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[8], x5m26);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[13], x6m25);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[5], x7m24);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[1], x8m23);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[9], x9m22);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[12], x10m21);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[4], x11m20);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[2], x12m19);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[10], x13m18);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[11], x14m17);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[3], x15m16);
let [y08, y23] = SseVector::column_butterfly2([m0823a, m0823b]);
let m0922a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p30);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[12], x2p29);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[3], x3p28);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[4], x4p27);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[13], x5p26);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[7], x6p25);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[0], x7p24);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[9], x8p23);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[11], x9p22);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[2], x10p21);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[5], x11p20);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[14], x12p19);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[6], x13p18);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[1], x14p17);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[10], x15p16);
let m0922b = SseVector::mul(self.twiddles_im[8], x1m30);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[12], x2m29);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[3], x3m28);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[4], x4m27);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[13], x5m26);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[7], x6m25);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[0], x7m24);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[9], x8m23);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[11], x9m22);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[2], x10m21);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[5], x11m20);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[14], x12m19);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[6], x13m18);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[1], x14m17);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[10], x15m16);
let [y09, y22] = SseVector::column_butterfly2([m0922a, m0922b]);
let m1021a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p30);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[10], x2p29);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[0], x3p28);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[8], x4p27);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[11], x5p26);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[1], x6p25);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[7], x7p24);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[12], x8p23);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[2], x9p22);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[6], x10p21);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[13], x11p20);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[3], x12p19);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[5], x13p18);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[14], x14p17);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[4], x15p16);
let m1021b = SseVector::mul(self.twiddles_im[9], x1m30);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[10], x2m29);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[0], x3m28);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[8], x4m27);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[11], x5m26);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[1], x6m25);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[7], x7m24);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[12], x8m23);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[2], x9m22);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[6], x10m21);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[13], x11m20);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[3], x12m19);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[5], x13m18);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[14], x14m17);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[4], x15m16);
let [y10, y21] = SseVector::column_butterfly2([m1021a, m1021b]);
let m1120a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p30);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[8], x2p29);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[1], x3p28);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[12], x4p27);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[6], x5p26);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[3], x6p25);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[14], x7p24);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[4], x8p23);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[5], x9p22);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[13], x10p21);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[2], x11p20);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[7], x12p19);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[11], x13p18);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[0], x14p17);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[9], x15p16);
let m1120b = SseVector::mul(self.twiddles_im[10], x1m30);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[8], x2m29);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[1], x3m28);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[12], x4m27);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[6], x5m26);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[3], x6m25);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[14], x7m24);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[4], x8m23);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[5], x9m22);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[13], x10m21);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[2], x11m20);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[7], x12m19);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[11], x13m18);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[0], x14m17);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[9], x15m16);
let [y11, y20] = SseVector::column_butterfly2([m1120a, m1120b]);
let m1219a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p30);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[6], x2p29);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[4], x3p28);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[13], x4p27);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[1], x5p26);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[9], x6p25);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[8], x7p24);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[2], x8p23);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[14], x9p22);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[3], x10p21);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[7], x11p20);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[10], x12p19);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[0], x13p18);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[12], x14p17);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[5], x15p16);
let m1219b = SseVector::mul(self.twiddles_im[11], x1m30);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[6], x2m29);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[4], x3m28);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[13], x4m27);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[1], x5m26);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[9], x6m25);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[8], x7m24);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[2], x8m23);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[14], x9m22);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[3], x10m21);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[7], x11m20);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[10], x12m19);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[0], x13m18);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[12], x14m17);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[5], x15m16);
let [y12, y19] = SseVector::column_butterfly2([m1219a, m1219b]);
let m1318a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p30);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[4], x2p29);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[7], x3p28);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[9], x4p27);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[2], x5p26);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[14], x6p25);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[1], x7p24);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[10], x8p23);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[6], x9p22);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[5], x10p21);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[11], x11p20);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[0], x12p19);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[13], x13p18);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[3], x14p17);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[8], x15p16);
let m1318b = SseVector::mul(self.twiddles_im[12], x1m30);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[4], x2m29);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[7], x3m28);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[9], x4m27);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[2], x5m26);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[14], x6m25);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[1], x7m24);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[10], x8m23);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[6], x9m22);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[5], x10m21);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[11], x11m20);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[0], x12m19);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[13], x13m18);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[3], x14m17);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[8], x15m16);
let [y13, y18] = SseVector::column_butterfly2([m1318a, m1318b]);
let m1417a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p30);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[2], x2p29);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[10], x3p28);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[5], x4p27);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[7], x5p26);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[8], x6p25);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[4], x7p24);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[11], x8p23);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[1], x9p22);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[14], x10p21);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[0], x11p20);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[12], x12p19);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[3], x13p18);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[9], x14p17);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[6], x15p16);
let m1417b = SseVector::mul(self.twiddles_im[13], x1m30);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[2], x2m29);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[10], x3m28);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[5], x4m27);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[7], x5m26);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[8], x6m25);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[4], x7m24);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[11], x8m23);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[1], x9m22);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[14], x10m21);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[0], x11m20);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[12], x12m19);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[3], x13m18);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[9], x14m17);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[6], x15m16);
let [y14, y17] = SseVector::column_butterfly2([m1417a, m1417b]);
let m1516a = SseVector::fmadd(values[0], self.twiddles_re[14], x1p30);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[0], x2p29);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[13], x3p28);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[1], x4p27);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[12], x5p26);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[2], x6p25);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[11], x7p24);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[3], x8p23);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[10], x9p22);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[4], x10p21);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[9], x11p20);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[5], x12p19);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[8], x13p18);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[6], x14p17);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[7], x15p16);
let m1516b = SseVector::mul(self.twiddles_im[14], x1m30);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[0], x2m29);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[13], x3m28);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[1], x4m27);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[12], x5m26);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[2], x6m25);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[11], x7m24);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[3], x8m23);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[10], x9m22);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[4], x10m21);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[9], x11m20);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[5], x12m19);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[8], x13m18);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[6], x14m17);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[7], x15m16);
let [y15, y16] = SseVector::column_butterfly2([m1516a, m1516b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
}
}
/// Length-31 FFT butterfly operating on `f64` data with SSE vectors.
///
/// The twiddle factors are stored split into real and imaginary parts, each
/// already broadcast into a `__m128d` by `new`, so the kernel can use them
/// directly as vector operands.
struct SseF64Butterfly31<T> {
/// Direction this instance was constructed for; also used to build the twiddles.
direction: FftDirection,
/// Real parts of the 15 twiddle factors (one per symmetric input pair `k` / `31 - k`).
twiddles_re: [__m128d; 15],
/// Imaginary parts of the same 15 twiddle factors.
twiddles_im: [__m128d; 15],
/// Carries the generic parameter `T` without storing a value of that type.
_phantom: std::marker::PhantomData<T>,
}
// Expands the project's shared f64 SSE butterfly boilerplate for this type with
// length 31; the closure tells the macro how to read the instance's direction.
// NOTE(review): the macro is defined outside this view — presumably it supplies
// the `Fft`/`Length`/`Direction` impls; confirm against its definition.
boilerplate_fft_sse_f64_butterfly!(SseF64Butterfly31, 31, |this: &SseF64Butterfly31<_>| this.direction);
impl<T: FftNum> SseF64Butterfly31<T> {
#[target_feature(enable = "sse4.1")]
unsafe fn new(direction: FftDirection) -> Self {
assert_f64::<T>();
let twiddles = make_twiddles(31, direction);
unsafe {Self {
direction,
twiddles_re: twiddles.map(|t| SseVector::broadcast_scalar(t.re)),
twiddles_im: twiddles.map(|t| SseVector::broadcast_scalar(t.im)),
_phantom: std::marker::PhantomData,
}}
}
/// Runs the length-31 butterfly in place on `buffer`.
///
/// Complex elements 0 through 30 are read from `buffer`, transformed by
/// `perform_fft_direct`, and the 31 results are written back to the same
/// indices.
///
/// # Safety
///
/// NOTE(review): presumably requires SSE4.1 support and a `buffer` holding at
/// least 31 complex elements — confirm against the callers and the
/// `SseArrayMut` contract.
#[inline(always)]
pub(crate) unsafe fn perform_fft_contiguous(&self, mut buffer: impl SseArrayMut<f64>) {
    // Gather every input element into a fixed-size array of vectors.
    let inputs = read_complex_to_array!(buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    });

    let outputs = self.perform_fft_direct(inputs);

    // Scatter the results back to the positions they were read from.
    write_complex_to_array!(outputs, buffer, {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
    });
}
#[inline(always)]
pub(crate) unsafe fn perform_fft_direct(&self, values: [__m128d; 31]) -> [__m128d; 31] {
let rotate = SseVector::make_rotate90(FftDirection::Inverse);
let y00 = values[0];
let [x1p30, x1m30] = SseVector::column_butterfly2([values[1], values[30]]);
let x1m30 = SseVector::apply_rotate90(rotate, x1m30);
let y00 = SseVector::add(y00, x1p30);
let [x2p29, x2m29] = SseVector::column_butterfly2([values[2], values[29]]);
let x2m29 = SseVector::apply_rotate90(rotate, x2m29);
let y00 = SseVector::add(y00, x2p29);
let [x3p28, x3m28] = SseVector::column_butterfly2([values[3], values[28]]);
let x3m28 = SseVector::apply_rotate90(rotate, x3m28);
let y00 = SseVector::add(y00, x3p28);
let [x4p27, x4m27] = SseVector::column_butterfly2([values[4], values[27]]);
let x4m27 = SseVector::apply_rotate90(rotate, x4m27);
let y00 = SseVector::add(y00, x4p27);
let [x5p26, x5m26] = SseVector::column_butterfly2([values[5], values[26]]);
let x5m26 = SseVector::apply_rotate90(rotate, x5m26);
let y00 = SseVector::add(y00, x5p26);
let [x6p25, x6m25] = SseVector::column_butterfly2([values[6], values[25]]);
let x6m25 = SseVector::apply_rotate90(rotate, x6m25);
let y00 = SseVector::add(y00, x6p25);
let [x7p24, x7m24] = SseVector::column_butterfly2([values[7], values[24]]);
let x7m24 = SseVector::apply_rotate90(rotate, x7m24);
let y00 = SseVector::add(y00, x7p24);
let [x8p23, x8m23] = SseVector::column_butterfly2([values[8], values[23]]);
let x8m23 = SseVector::apply_rotate90(rotate, x8m23);
let y00 = SseVector::add(y00, x8p23);
let [x9p22, x9m22] = SseVector::column_butterfly2([values[9], values[22]]);
let x9m22 = SseVector::apply_rotate90(rotate, x9m22);
let y00 = SseVector::add(y00, x9p22);
let [x10p21, x10m21] = SseVector::column_butterfly2([values[10], values[21]]);
let x10m21 = SseVector::apply_rotate90(rotate, x10m21);
let y00 = SseVector::add(y00, x10p21);
let [x11p20, x11m20] = SseVector::column_butterfly2([values[11], values[20]]);
let x11m20 = SseVector::apply_rotate90(rotate, x11m20);
let y00 = SseVector::add(y00, x11p20);
let [x12p19, x12m19] = SseVector::column_butterfly2([values[12], values[19]]);
let x12m19 = SseVector::apply_rotate90(rotate, x12m19);
let y00 = SseVector::add(y00, x12p19);
let [x13p18, x13m18] = SseVector::column_butterfly2([values[13], values[18]]);
let x13m18 = SseVector::apply_rotate90(rotate, x13m18);
let y00 = SseVector::add(y00, x13p18);
let [x14p17, x14m17] = SseVector::column_butterfly2([values[14], values[17]]);
let x14m17 = SseVector::apply_rotate90(rotate, x14m17);
let y00 = SseVector::add(y00, x14p17);
let [x15p16, x15m16] = SseVector::column_butterfly2([values[15], values[16]]);
let x15m16 = SseVector::apply_rotate90(rotate, x15m16);
let y00 = SseVector::add(y00, x15p16);
let m0130a = SseVector::fmadd(values[0], self.twiddles_re[0], x1p30);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[1], x2p29);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[2], x3p28);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[3], x4p27);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[4], x5p26);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[5], x6p25);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[6], x7p24);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[7], x8p23);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[8], x9p22);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[9], x10p21);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[10], x11p20);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[11], x12p19);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[12], x13p18);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[13], x14p17);
let m0130a = SseVector::fmadd(m0130a, self.twiddles_re[14], x15p16);
let m0130b = SseVector::mul(self.twiddles_im[0], x1m30);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[1], x2m29);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[2], x3m28);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[3], x4m27);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[4], x5m26);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[5], x6m25);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[6], x7m24);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[7], x8m23);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[8], x9m22);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[9], x10m21);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[10], x11m20);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[11], x12m19);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[12], x13m18);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[13], x14m17);
let m0130b = SseVector::fmadd(m0130b, self.twiddles_im[14], x15m16);
let [y01, y30] = SseVector::column_butterfly2([m0130a, m0130b]);
let m0229a = SseVector::fmadd(values[0], self.twiddles_re[1], x1p30);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[3], x2p29);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[5], x3p28);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[7], x4p27);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[9], x5p26);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[11], x6p25);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[13], x7p24);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[14], x8p23);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[12], x9p22);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[10], x10p21);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[8], x11p20);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[6], x12p19);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[4], x13p18);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[2], x14p17);
let m0229a = SseVector::fmadd(m0229a, self.twiddles_re[0], x15p16);
let m0229b = SseVector::mul(self.twiddles_im[1], x1m30);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[3], x2m29);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[5], x3m28);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[7], x4m27);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[9], x5m26);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[11], x6m25);
let m0229b = SseVector::fmadd(m0229b, self.twiddles_im[13], x7m24);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[14], x8m23);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[12], x9m22);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[10], x10m21);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[8], x11m20);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[6], x12m19);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[4], x13m18);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[2], x14m17);
let m0229b = SseVector::nmadd(m0229b, self.twiddles_im[0], x15m16);
let [y02, y29] = SseVector::column_butterfly2([m0229a, m0229b]);
let m0328a = SseVector::fmadd(values[0], self.twiddles_re[2], x1p30);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[5], x2p29);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[8], x3p28);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[11], x4p27);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[14], x5p26);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[12], x6p25);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[9], x7p24);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[6], x8p23);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[3], x9p22);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[0], x10p21);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[1], x11p20);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[4], x12p19);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[7], x13p18);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[10], x14p17);
let m0328a = SseVector::fmadd(m0328a, self.twiddles_re[13], x15p16);
let m0328b = SseVector::mul(self.twiddles_im[2], x1m30);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[5], x2m29);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[8], x3m28);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[11], x4m27);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[14], x5m26);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[12], x6m25);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[9], x7m24);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[6], x8m23);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[3], x9m22);
let m0328b = SseVector::nmadd(m0328b, self.twiddles_im[0], x10m21);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[1], x11m20);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[4], x12m19);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[7], x13m18);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[10], x14m17);
let m0328b = SseVector::fmadd(m0328b, self.twiddles_im[13], x15m16);
let [y03, y28] = SseVector::column_butterfly2([m0328a, m0328b]);
let m0427a = SseVector::fmadd(values[0], self.twiddles_re[3], x1p30);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[7], x2p29);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[11], x3p28);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[14], x4p27);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[10], x5p26);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[6], x6p25);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[2], x7p24);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[0], x8p23);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[4], x9p22);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[8], x10p21);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[12], x11p20);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[13], x12p19);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[9], x13p18);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[5], x14p17);
let m0427a = SseVector::fmadd(m0427a, self.twiddles_re[1], x15p16);
let m0427b = SseVector::mul(self.twiddles_im[3], x1m30);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[7], x2m29);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[11], x3m28);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[14], x4m27);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[10], x5m26);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[6], x6m25);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[2], x7m24);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[0], x8m23);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[4], x9m22);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[8], x10m21);
let m0427b = SseVector::fmadd(m0427b, self.twiddles_im[12], x11m20);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[13], x12m19);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[9], x13m18);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[5], x14m17);
let m0427b = SseVector::nmadd(m0427b, self.twiddles_im[1], x15m16);
let [y04, y27] = SseVector::column_butterfly2([m0427a, m0427b]);
let m0526a = SseVector::fmadd(values[0], self.twiddles_re[4], x1p30);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[9], x2p29);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[14], x3p28);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[10], x4p27);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[5], x5p26);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[0], x6p25);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[3], x7p24);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[8], x8p23);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[13], x9p22);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[11], x10p21);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[6], x11p20);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[1], x12p19);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[2], x13p18);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[7], x14p17);
let m0526a = SseVector::fmadd(m0526a, self.twiddles_re[12], x15p16);
let m0526b = SseVector::mul(self.twiddles_im[4], x1m30);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[9], x2m29);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[14], x3m28);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[10], x4m27);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[5], x5m26);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[0], x6m25);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[3], x7m24);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[8], x8m23);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[13], x9m22);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[11], x10m21);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[6], x11m20);
let m0526b = SseVector::nmadd(m0526b, self.twiddles_im[1], x12m19);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[2], x13m18);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[7], x14m17);
let m0526b = SseVector::fmadd(m0526b, self.twiddles_im[12], x15m16);
let [y05, y26] = SseVector::column_butterfly2([m0526a, m0526b]);
let m0625a = SseVector::fmadd(values[0], self.twiddles_re[5], x1p30);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[11], x2p29);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[12], x3p28);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[6], x4p27);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[0], x5p26);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[4], x6p25);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[10], x7p24);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[13], x8p23);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[7], x9p22);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[1], x10p21);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[3], x11p20);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[9], x12p19);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[14], x13p18);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[8], x14p17);
let m0625a = SseVector::fmadd(m0625a, self.twiddles_re[2], x15p16);
let m0625b = SseVector::mul(self.twiddles_im[5], x1m30);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[11], x2m29);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[12], x3m28);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[6], x4m27);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[0], x5m26);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[4], x6m25);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[10], x7m24);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[13], x8m23);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[7], x9m22);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[1], x10m21);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[3], x11m20);
let m0625b = SseVector::fmadd(m0625b, self.twiddles_im[9], x12m19);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[14], x13m18);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[8], x14m17);
let m0625b = SseVector::nmadd(m0625b, self.twiddles_im[2], x15m16);
let [y06, y25] = SseVector::column_butterfly2([m0625a, m0625b]);
let m0724a = SseVector::fmadd(values[0], self.twiddles_re[6], x1p30);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[13], x2p29);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[9], x3p28);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[2], x4p27);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[3], x5p26);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[10], x6p25);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[12], x7p24);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[5], x8p23);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[0], x9p22);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[7], x10p21);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[14], x11p20);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[8], x12p19);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[1], x13p18);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[4], x14p17);
let m0724a = SseVector::fmadd(m0724a, self.twiddles_re[11], x15p16);
let m0724b = SseVector::mul(self.twiddles_im[6], x1m30);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[13], x2m29);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[9], x3m28);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[2], x4m27);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[3], x5m26);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[10], x6m25);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[12], x7m24);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[5], x8m23);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[0], x9m22);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[7], x10m21);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[14], x11m20);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[8], x12m19);
let m0724b = SseVector::nmadd(m0724b, self.twiddles_im[1], x13m18);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[4], x14m17);
let m0724b = SseVector::fmadd(m0724b, self.twiddles_im[11], x15m16);
let [y07, y24] = SseVector::column_butterfly2([m0724a, m0724b]);
let m0823a = SseVector::fmadd(values[0], self.twiddles_re[7], x1p30);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[14], x2p29);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[6], x3p28);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[0], x4p27);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[8], x5p26);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[13], x6p25);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[5], x7p24);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[1], x8p23);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[9], x9p22);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[12], x10p21);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[4], x11p20);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[2], x12p19);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[10], x13p18);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[11], x14p17);
let m0823a = SseVector::fmadd(m0823a, self.twiddles_re[3], x15p16);
let m0823b = SseVector::mul(self.twiddles_im[7], x1m30);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[14], x2m29);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[6], x3m28);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[0], x4m27);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[8], x5m26);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[13], x6m25);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[5], x7m24);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[1], x8m23);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[9], x9m22);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[12], x10m21);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[4], x11m20);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[2], x12m19);
let m0823b = SseVector::fmadd(m0823b, self.twiddles_im[10], x13m18);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[11], x14m17);
let m0823b = SseVector::nmadd(m0823b, self.twiddles_im[3], x15m16);
let [y08, y23] = SseVector::column_butterfly2([m0823a, m0823b]);
let m0922a = SseVector::fmadd(values[0], self.twiddles_re[8], x1p30);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[12], x2p29);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[3], x3p28);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[4], x4p27);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[13], x5p26);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[7], x6p25);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[0], x7p24);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[9], x8p23);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[11], x9p22);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[2], x10p21);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[5], x11p20);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[14], x12p19);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[6], x13p18);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[1], x14p17);
let m0922a = SseVector::fmadd(m0922a, self.twiddles_re[10], x15p16);
let m0922b = SseVector::mul(self.twiddles_im[8], x1m30);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[12], x2m29);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[3], x3m28);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[4], x4m27);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[13], x5m26);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[7], x6m25);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[0], x7m24);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[9], x8m23);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[11], x9m22);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[2], x10m21);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[5], x11m20);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[14], x12m19);
let m0922b = SseVector::nmadd(m0922b, self.twiddles_im[6], x13m18);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[1], x14m17);
let m0922b = SseVector::fmadd(m0922b, self.twiddles_im[10], x15m16);
let [y09, y22] = SseVector::column_butterfly2([m0922a, m0922b]);
let m1021a = SseVector::fmadd(values[0], self.twiddles_re[9], x1p30);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[10], x2p29);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[0], x3p28);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[8], x4p27);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[11], x5p26);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[1], x6p25);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[7], x7p24);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[12], x8p23);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[2], x9p22);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[6], x10p21);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[13], x11p20);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[3], x12p19);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[5], x13p18);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[14], x14p17);
let m1021a = SseVector::fmadd(m1021a, self.twiddles_re[4], x15p16);
let m1021b = SseVector::mul(self.twiddles_im[9], x1m30);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[10], x2m29);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[0], x3m28);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[8], x4m27);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[11], x5m26);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[1], x6m25);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[7], x7m24);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[12], x8m23);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[2], x9m22);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[6], x10m21);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[13], x11m20);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[3], x12m19);
let m1021b = SseVector::fmadd(m1021b, self.twiddles_im[5], x13m18);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[14], x14m17);
let m1021b = SseVector::nmadd(m1021b, self.twiddles_im[4], x15m16);
let [y10, y21] = SseVector::column_butterfly2([m1021a, m1021b]);
let m1120a = SseVector::fmadd(values[0], self.twiddles_re[10], x1p30);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[8], x2p29);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[1], x3p28);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[12], x4p27);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[6], x5p26);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[3], x6p25);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[14], x7p24);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[4], x8p23);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[5], x9p22);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[13], x10p21);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[2], x11p20);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[7], x12p19);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[11], x13p18);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[0], x14p17);
let m1120a = SseVector::fmadd(m1120a, self.twiddles_re[9], x15p16);
let m1120b = SseVector::mul(self.twiddles_im[10], x1m30);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[8], x2m29);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[1], x3m28);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[12], x4m27);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[6], x5m26);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[3], x6m25);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[14], x7m24);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[4], x8m23);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[5], x9m22);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[13], x10m21);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[2], x11m20);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[7], x12m19);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[11], x13m18);
let m1120b = SseVector::nmadd(m1120b, self.twiddles_im[0], x14m17);
let m1120b = SseVector::fmadd(m1120b, self.twiddles_im[9], x15m16);
let [y11, y20] = SseVector::column_butterfly2([m1120a, m1120b]);
let m1219a = SseVector::fmadd(values[0], self.twiddles_re[11], x1p30);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[6], x2p29);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[4], x3p28);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[13], x4p27);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[1], x5p26);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[9], x6p25);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[8], x7p24);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[2], x8p23);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[14], x9p22);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[3], x10p21);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[7], x11p20);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[10], x12p19);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[0], x13p18);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[12], x14p17);
let m1219a = SseVector::fmadd(m1219a, self.twiddles_re[5], x15p16);
let m1219b = SseVector::mul(self.twiddles_im[11], x1m30);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[6], x2m29);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[4], x3m28);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[13], x4m27);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[1], x5m26);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[9], x6m25);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[8], x7m24);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[2], x8m23);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[14], x9m22);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[3], x10m21);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[7], x11m20);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[10], x12m19);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[0], x13m18);
let m1219b = SseVector::fmadd(m1219b, self.twiddles_im[12], x14m17);
let m1219b = SseVector::nmadd(m1219b, self.twiddles_im[5], x15m16);
let [y12, y19] = SseVector::column_butterfly2([m1219a, m1219b]);
let m1318a = SseVector::fmadd(values[0], self.twiddles_re[12], x1p30);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[4], x2p29);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[7], x3p28);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[9], x4p27);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[2], x5p26);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[14], x6p25);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[1], x7p24);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[10], x8p23);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[6], x9p22);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[5], x10p21);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[11], x11p20);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[0], x12p19);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[13], x13p18);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[3], x14p17);
let m1318a = SseVector::fmadd(m1318a, self.twiddles_re[8], x15p16);
let m1318b = SseVector::mul(self.twiddles_im[12], x1m30);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[4], x2m29);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[7], x3m28);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[9], x4m27);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[2], x5m26);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[14], x6m25);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[1], x7m24);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[10], x8m23);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[6], x9m22);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[5], x10m21);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[11], x11m20);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[0], x12m19);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[13], x13m18);
let m1318b = SseVector::nmadd(m1318b, self.twiddles_im[3], x14m17);
let m1318b = SseVector::fmadd(m1318b, self.twiddles_im[8], x15m16);
let [y13, y18] = SseVector::column_butterfly2([m1318a, m1318b]);
let m1417a = SseVector::fmadd(values[0], self.twiddles_re[13], x1p30);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[2], x2p29);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[10], x3p28);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[5], x4p27);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[7], x5p26);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[8], x6p25);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[4], x7p24);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[11], x8p23);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[1], x9p22);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[14], x10p21);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[0], x11p20);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[12], x12p19);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[3], x13p18);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[9], x14p17);
let m1417a = SseVector::fmadd(m1417a, self.twiddles_re[6], x15p16);
let m1417b = SseVector::mul(self.twiddles_im[13], x1m30);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[2], x2m29);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[10], x3m28);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[5], x4m27);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[7], x5m26);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[8], x6m25);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[4], x7m24);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[11], x8m23);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[1], x9m22);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[14], x10m21);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[0], x11m20);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[12], x12m19);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[3], x13m18);
let m1417b = SseVector::fmadd(m1417b, self.twiddles_im[9], x14m17);
let m1417b = SseVector::nmadd(m1417b, self.twiddles_im[6], x15m16);
let [y14, y17] = SseVector::column_butterfly2([m1417a, m1417b]);
let m1516a = SseVector::fmadd(values[0], self.twiddles_re[14], x1p30);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[0], x2p29);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[13], x3p28);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[1], x4p27);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[12], x5p26);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[2], x6p25);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[11], x7p24);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[3], x8p23);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[10], x9p22);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[4], x10p21);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[9], x11p20);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[5], x12p19);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[8], x13p18);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[6], x14p17);
let m1516a = SseVector::fmadd(m1516a, self.twiddles_re[7], x15p16);
let m1516b = SseVector::mul(self.twiddles_im[14], x1m30);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[0], x2m29);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[13], x3m28);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[1], x4m27);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[12], x5m26);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[2], x6m25);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[11], x7m24);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[3], x8m23);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[10], x9m22);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[4], x10m21);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[9], x11m20);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[5], x12m19);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[8], x13m18);
let m1516b = SseVector::nmadd(m1516b, self.twiddles_im[6], x14m17);
let m1516b = SseVector::fmadd(m1516b, self.twiddles_im[7], x15m16);
let [y15, y16] = SseVector::column_butterfly2([m1516a, m1516b]);
[y00, y01, y02, y03, y04, y05, y06, y07, y08, y09, y10, y11, y12, y13, y14, y15, y16, y17, y18, y19, y20, y21, y22, y23, y24, y25, y26, y27, y28, y29, y30]
}
}
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::test_utils::check_fft_algorithm;

    // Expands a table of `test_fn => ButterflyStruct, len;` rows into one
    // `#[test]` function per row. Every generated test:
    //   1. asserts that the host CPU reports SSE4.1 support,
    //   2. constructs the butterfly in the forward direction and hands it to
    //      `check_fft_algorithm` with the given sample type and length,
    //   3. does the same for the inverse direction.
    // The sample type (`f32` or `f64`) is given once, ahead of the rows.
    macro_rules! butterfly_tests {
        ($float:ty; $($test_name:ident => $butterfly:ident, $len:expr;)+) => {
            $(
                #[test]
                fn $test_name() {
                    assert!(std::arch::is_x86_feature_detected!("sse4.1"));

                    // NOTE(review): `new` is an unsafe constructor; presumably its
                    // only requirement is SSE4.1 availability, which the assert
                    // above establishes -- confirm against the constructor's docs.
                    let forward = unsafe { $butterfly::new(FftDirection::Forward) };
                    check_fft_algorithm::<$float>(&forward, $len, FftDirection::Forward);

                    let inverse = unsafe { $butterfly::new(FftDirection::Inverse) };
                    check_fft_algorithm::<$float>(&inverse, $len, FftDirection::Inverse);
                }
            )+
        };
    }

    // Single-precision butterflies.
    butterfly_tests! { f32;
        test_ssef32_butterfly7 => SseF32Butterfly7, 7;
        test_ssef32_butterfly11 => SseF32Butterfly11, 11;
        test_ssef32_butterfly13 => SseF32Butterfly13, 13;
        test_ssef32_butterfly17 => SseF32Butterfly17, 17;
        test_ssef32_butterfly19 => SseF32Butterfly19, 19;
        test_ssef32_butterfly23 => SseF32Butterfly23, 23;
        test_ssef32_butterfly29 => SseF32Butterfly29, 29;
        test_ssef32_butterfly31 => SseF32Butterfly31, 31;
    }

    // Double-precision butterflies.
    butterfly_tests! { f64;
        test_ssef64_butterfly7 => SseF64Butterfly7, 7;
        test_ssef64_butterfly11 => SseF64Butterfly11, 11;
        test_ssef64_butterfly13 => SseF64Butterfly13, 13;
        test_ssef64_butterfly17 => SseF64Butterfly17, 17;
        test_ssef64_butterfly19 => SseF64Butterfly19, 19;
        test_ssef64_butterfly23 => SseF64Butterfly23, 23;
        test_ssef64_butterfly29 => SseF64Butterfly29, 29;
        test_ssef64_butterfly31 => SseF64Butterfly31, 31;
    }
}