rustfft/algorithm/
butterflies.rs

1use num_complex::Complex;
2
3use crate::{common::FftNum, FftDirection};
4
5use crate::array_utils::{DoubleBuf, LoadStore};
6use crate::twiddles;
7use crate::{Direction, Fft, Length};
8
// Generates the trait plumbing shared by the scalar butterflies in this file.
//
// Arguments:
// - `$struct_name`: the butterfly type. It must provide an inherent
//   `unsafe fn perform_fft_contiguous(&self, buffer: impl LoadStore<T>)`.
// - `$len`: expression returned by the generated `Length::len`.
// - `$direction_fn`: callable that receives `&$struct_name<T>` and returns its `FftDirection`.
//
// The generated `Fft` methods forward to the `crate::fft_helper` functions, passing
// `self.len()`, an empty scratch slice and a closure that runs the butterfly on the
// chunk(s) the helper provides. All three scratch-length getters report 0.
// NOTE(review): the literal `0` passed to the helpers looks like the required scratch
// length -- confirm against `crate::fft_helper`.
#[allow(unused)] // NOTE(review): presumably silences warnings when the macro is not expanded -- confirm
macro_rules! boilerplate_fft_butterfly {
    ($struct_name:ident, $len:expr, $direction_fn:expr) => {
        impl<T> Length for $struct_name<T> {
            #[inline(always)]
            fn len(&self) -> usize {
                $len
            }
        }
        impl<T> Direction for $struct_name<T> {
            #[inline(always)]
            fn fft_direction(&self) -> FftDirection {
                $direction_fn(self)
            }
        }
        impl<T: FftNum> $struct_name<T> {
            // Thin, uniformly named entry point that forwards to the type's own
            // `perform_fft_contiguous`.
            #[inline(always)]
            pub(crate) unsafe fn perform_fft_butterfly(&self, buffer: impl LoadStore<T>) {
                self.perform_fft_contiguous(buffer);
            }
        }
        impl<T: FftNum> Fft<T> for $struct_name<T> {
            #[inline(always)]
            fn get_inplace_scratch_len(&self) -> usize {
                0
            }
            #[inline(always)]
            fn get_outofplace_scratch_len(&self) -> usize {
                0
            }
            #[inline(always)]
            fn get_immutable_scratch_len(&self) -> usize {
                0
            }
            fn process_with_scratch(&self, buffer: &mut [Complex<T>], _scratch: &mut [Complex<T>]) {
                let fft_len = self.len();
                crate::fft_helper::fft_helper_inplace(buffer, &mut [], fft_len, 0, |data, _| unsafe {
                    self.perform_fft_butterfly(data)
                });
            }
            fn process_outofplace_with_scratch(
                &self,
                input: &mut [Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                let fft_len = self.len();
                crate::fft_helper::fft_helper_outofplace(
                    input,
                    output,
                    &mut [],
                    fft_len,
                    0,
                    |src, dst, _| unsafe {
                        // Read from `src`, write to `dst`.
                        let io = DoubleBuf {
                            input: src,
                            output: dst,
                        };
                        self.perform_fft_butterfly(io)
                    },
                );
            }
            #[inline]
            fn process_immutable_with_scratch(
                &self,
                input: &[Complex<T>],
                output: &mut [Complex<T>],
                _scratch: &mut [Complex<T>],
            ) {
                let fft_len = self.len();
                crate::fft_helper::fft_helper_immut(
                    input,
                    output,
                    &mut [],
                    fft_len,
                    0,
                    |src, dst, _| unsafe {
                        // Read from `src`, write to `dst`.
                        let io = DoubleBuf {
                            input: src,
                            output: dst,
                        };
                        self.perform_fft_butterfly(io)
                    },
                );
            }
        }
    };
}
96
97pub struct Butterfly1<T> {
98    direction: FftDirection,
99    _phantom: std::marker::PhantomData<T>,
100}
101impl<T: FftNum> Butterfly1<T> {
102    #[inline(always)]
103    pub fn new(direction: FftDirection) -> Self {
104        Self {
105            direction,
106            _phantom: std::marker::PhantomData,
107        }
108    }
109}
110impl<T: FftNum> Fft<T> for Butterfly1<T> {
111    fn process_immutable_with_scratch(
112        &self,
113        input: &[Complex<T>],
114        output: &mut [Complex<T>],
115        _scratch: &mut [Complex<T>],
116    ) {
117        output.copy_from_slice(input);
118    }
119
120    fn process_outofplace_with_scratch(
121        &self,
122        input: &mut [Complex<T>],
123        output: &mut [Complex<T>],
124        _scratch: &mut [Complex<T>],
125    ) {
126        output.copy_from_slice(input);
127    }
128
129    fn process_with_scratch(&self, _buffer: &mut [Complex<T>], _scratch: &mut [Complex<T>]) {}
130
131    fn get_inplace_scratch_len(&self) -> usize {
132        0
133    }
134
135    fn get_outofplace_scratch_len(&self) -> usize {
136        0
137    }
138
139    fn get_immutable_scratch_len(&self) -> usize {
140        0
141    }
142}
143impl<T> Length for Butterfly1<T> {
144    fn len(&self) -> usize {
145        1
146    }
147}
148impl<T> Direction for Butterfly1<T> {
149    fn fft_direction(&self) -> FftDirection {
150        self.direction
151    }
152}
153
154pub struct Butterfly2<T> {
155    direction: FftDirection,
156    _phantom: std::marker::PhantomData<T>,
157}
158boilerplate_fft_butterfly!(Butterfly2, 2, |this: &Butterfly2<_>| this.direction);
159impl<T: FftNum> Butterfly2<T> {
160    #[inline(always)]
161    pub fn new(direction: FftDirection) -> Self {
162        Self {
163            direction,
164            _phantom: std::marker::PhantomData,
165        }
166    }
167    #[inline(always)]
168    unsafe fn perform_fft_strided(left: &mut Complex<T>, right: &mut Complex<T>) {
169        let temp = *left + *right;
170
171        *right = *left - *right;
172        *left = temp;
173    }
174    #[inline(always)]
175    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
176        let value0 = buffer.load(0);
177        let value1 = buffer.load(1);
178        buffer.store(value0 + value1, 0);
179        buffer.store(value0 - value1, 1);
180    }
181}
182
183pub struct Butterfly3<T> {
184    pub twiddle: Complex<T>,
185    direction: FftDirection,
186}
187boilerplate_fft_butterfly!(Butterfly3, 3, |this: &Butterfly3<_>| this.direction);
188impl<T: FftNum> Butterfly3<T> {
189    #[inline(always)]
190    pub fn new(direction: FftDirection) -> Self {
191        Self {
192            twiddle: twiddles::compute_twiddle(1, 3, direction),
193            direction,
194        }
195    }
196    #[inline(always)]
197    pub fn direction_of(fft: &Butterfly3<T>) -> Self {
198        Self {
199            twiddle: fft.twiddle.conj(),
200            direction: fft.direction.opposite_direction(),
201        }
202    }
203    #[inline(always)]
204    unsafe fn perform_fft_strided(
205        &self,
206        val0: &mut Complex<T>,
207        val1: &mut Complex<T>,
208        val2: &mut Complex<T>,
209    ) {
210        let xp = *val1 + *val2;
211        let xn = *val1 - *val2;
212        let sum = *val0 + xp;
213
214        let temp_a = *val0
215            + Complex {
216                re: self.twiddle.re * xp.re,
217                im: self.twiddle.re * xp.im,
218            };
219        let temp_b = Complex {
220            re: -self.twiddle.im * xn.im,
221            im: self.twiddle.im * xn.re,
222        };
223
224        *val0 = sum;
225        *val1 = temp_a + temp_b;
226        *val2 = temp_a - temp_b;
227    }
228
229    #[inline(always)]
230    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
231        let xp = buffer.load(1) + buffer.load(2);
232        let xn = buffer.load(1) - buffer.load(2);
233        let sum = buffer.load(0) + xp;
234
235        let temp_a = buffer.load(0)
236            + Complex {
237                re: self.twiddle.re * xp.re,
238                im: self.twiddle.re * xp.im,
239            };
240        let temp_b = Complex {
241            re: -self.twiddle.im * xn.im,
242            im: self.twiddle.im * xn.re,
243        };
244
245        buffer.store(sum, 0);
246        buffer.store(temp_a + temp_b, 1);
247        buffer.store(temp_a - temp_b, 2);
248    }
249}
250
251pub struct Butterfly4<T> {
252    direction: FftDirection,
253    _phantom: std::marker::PhantomData<T>,
254}
255boilerplate_fft_butterfly!(Butterfly4, 4, |this: &Butterfly4<_>| this.direction);
256impl<T: FftNum> Butterfly4<T> {
257    #[inline(always)]
258    pub fn new(direction: FftDirection) -> Self {
259        Self {
260            direction,
261            _phantom: std::marker::PhantomData,
262        }
263    }
264    #[inline(always)]
265    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
266        //we're going to hardcode a step of mixed radix
267        //aka we're going to do the six step algorithm
268
269        // step 1: transpose, which we're skipping because we're just going to perform non-contiguous FFTs
270        let mut value0 = buffer.load(0);
271        let mut value1 = buffer.load(1);
272        let mut value2 = buffer.load(2);
273        let mut value3 = buffer.load(3);
274
275        // step 2: column FFTs
276        Butterfly2::perform_fft_strided(&mut value0, &mut value2);
277        Butterfly2::perform_fft_strided(&mut value1, &mut value3);
278
279        // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i)
280        value3 = twiddles::rotate_90(value3, self.direction);
281
282        // step 4: transpose, which we're skipping because we're the previous FFTs were non-contiguous
283
284        // step 5: row FFTs
285        Butterfly2::perform_fft_strided(&mut value0, &mut value1);
286        Butterfly2::perform_fft_strided(&mut value2, &mut value3);
287
288        // step 6: transpose by swapping index 1 and 2
289        buffer.store(value0, 0);
290        buffer.store(value2, 1);
291        buffer.store(value1, 2);
292        buffer.store(value3, 3);
293    }
294
295    #[inline(always)]
296    unsafe fn perform_fft_strided(
297        &self,
298        value0: &mut Complex<T>,
299        value1: &mut Complex<T>,
300        value2: &mut Complex<T>,
301        value3: &mut Complex<T>,
302    ) {
303        // step 2: column FFTs
304        Butterfly2::perform_fft_strided(value0, value2);
305        Butterfly2::perform_fft_strided(value1, value3);
306
307        // step 3: apply twiddle factors (only one in this case, and it's either 0 + i or 0 - i)
308        *value3 = twiddles::rotate_90(*value3, self.direction);
309
310        // step 4: transpose, which we're skipping because we're the previous FFTs were non-contiguous
311
312        // step 5: row FFTs
313        Butterfly2::perform_fft_strided(value0, value1);
314        Butterfly2::perform_fft_strided(value2, value3);
315
316        // step 6: transpose
317        let temp = *value1;
318        *value1 = *value2;
319        *value2 = temp;
320    }
321}
322
323pub struct Butterfly5<T> {
324    twiddle1: Complex<T>,
325    twiddle2: Complex<T>,
326    direction: FftDirection,
327}
328boilerplate_fft_butterfly!(Butterfly5, 5, |this: &Butterfly5<_>| this.direction);
329impl<T: FftNum> Butterfly5<T> {
330    pub fn new(direction: FftDirection) -> Self {
331        Self {
332            twiddle1: twiddles::compute_twiddle(1, 5, direction),
333            twiddle2: twiddles::compute_twiddle(2, 5, direction),
334            direction,
335        }
336    }
337
338    #[inline(never)] // refusing to inline this code reduces code size, and doesn't hurt performance
339    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
340        // let mut outer = Butterfly2::perform_fft_array([buffer.load(1), buffer.load(4)]);
341        // let mut inner = Butterfly2::perform_fft_array([buffer.load(2), buffer.load(3)]);
342        // let input0 = buffer.load(0);
343
344        // buffer.store(input0 + outer[0] + inner[0], 0);
345
346        // inner[1] = twiddles::rotate_90(inner[1], true);
347        // outer[1] = twiddles::rotate_90(outer[1], true);
348
349        // {
350        //     let twiddled1 = outer[0] * self.twiddles[0].re;
351        //     let twiddled2 = inner[0] * self.twiddles[1].re;
352        //     let twiddled3 = inner[1] * self.twiddles[1].im;
353        //     let twiddled4 = outer[1] * self.twiddles[0].im;
354
355        //     let sum12 = twiddled1 + twiddled2;
356        //     let sum34 = twiddled4 + twiddled3;
357
358        //     let output1 = sum12 + sum34;
359        //     let output4 = sum12 - sum34;
360
361        //     buffer.store(input0 + output1, 1);
362        //     buffer.store(input0 + output4, 4);
363        // }
364
365        // {
366        //     let twiddled1 = outer[0] * self.twiddles[1].re;
367        //     let twiddled2 = inner[0] * self.twiddles[0].re;
368        //     let twiddled3 = inner[1] * self.twiddles[0].im;
369        //     let twiddled4 = outer[1] * self.twiddles[1].im;
370        // }
371
372        // Let's do a plain 5-point Dft
373        // |X0|   | W0 W0  W0  W0  W0  |   |x0|
374        // |X1|   | W0 W1  W2  W3  W4  |   |x1|
375        // |X2| = | W0 W2  W4  W6  W8  | * |x2|
376        // |X3|   | W0 W3  W6  W9  W12 |   |x3|
377        // |X4|   | W0 W4  W8  W12 W16 |   |x4|
378        //
379        // where Wn = exp(-2*pi*n/5) for a forward transform, and exp(+2*pi*n/5) for an direction.
380        //
381        // This can be simplified a bit since exp(-2*pi*n/5) = exp(-2*pi*n/5 + m*2*pi)
382        // |X0|   | W0 W0  W0  W0  W0 |   |x0|
383        // |X1|   | W0 W1  W2  W3  W4 |   |x1|
384        // |X2| = | W0 W2  W4  W1  W3 | * |x2|
385        // |X3|   | W0 W3  W1  W4  W2 |   |x3|
386        // |X4|   | W0 W4  W3  W2  W1 |   |x4|
387        //
388        // Next we can use the symmetry that W3 = W2* and W4 = W1* (where * means complex conjugate), and W0 = 1
389        // |X0|   | 1  1   1   1   1   |   |x0|
390        // |X1|   | 1  W1  W2  W2* W1* |   |x1|
391        // |X2| = | 1  W2  W1* W1  W2* | * |x2|
392        // |X3|   | 1  W2* W1  W1* W2  |   |x3|
393        // |X4|   | 1  W1* W2* W2  W1  |   |x4|
394        //
395        // Next, we write out the whole expression with real and imaginary parts.
396        // X0 = x0 + x1 + x2 + x3 + x4
397        // X1 = x0 + (W1.re + j*W1.im)*x1 + (W2.re + j*W2.im)*x2 + (W2.re - j*W2.im)*x3 + (W1.re - j*W1.im)*x4
398        // X2 = x0 + (W2.re + j*W2.im)*x1 + (W1.re - j*W1.im)*x2 + (W1.re + j*W1.im)*x3 + (W2.re - j*W2.im)*x4
399        // X3 = x0 + (W2.re - j*W2.im)*x1 + (W1.re + j*W1.im)*x2 + (W1.re - j*W1.im)*x3 + (W2.re + j*W2.im)*x4
400        // X4 = x0 + (W1.re - j*W1.im)*x1 + (W2.re - j*W2.im)*x2 + (W2.re + j*W2.im)*x3 + (W1.re + j*W1.im)*x4
401        //
402        // Then we rearrange and sort terms.
403        // X0 = x0 + x1 + x2 + x3 + x4
404        // X1 = x0 + W1.re*(x1+x4) + W2.re*(x2+x3) + j*(W1.im*(x1-x4) + W2.im*(x2-x3))
405        // X2 = x0 + W1.re*(x2+x3) + W2.re*(x1+x4) - j*(W1.im*(x2-x3) - W2.im*(x1-x4))
406        // X3 = x0 + W1.re*(x2+x3) + W2.re*(x1+x4) + j*(W1.im*(x2-x3) - W2.im*(x1-x4))
407        // X4 = x0 + W1.re*(x1+x4) + W2.re*(x2+x3) - j*(W1.im*(x1-x4) + W2.im*(x2-x3))
408        //
409        // Now we define x14p=x1+x4 x14n=x1-x4, x23p=x2+x3, x23n=x2-x3
410        // X0 = x0 + x1 + x2 + x3 + x4
411        // X1 = x0 + W1.re*(x14p) + W2.re*(x23p) + j*(W1.im*(x14n) + W2.im*(x23n))
412        // X2 = x0 + W1.re*(x23p) + W2.re*(x14p) - j*(W1.im*(x23n) - W2.im*(x14n))
413        // X3 = x0 + W1.re*(x23p) + W2.re*(x14p) + j*(W1.im*(x23n) - W2.im*(x14n))
414        // X4 = x0 + W1.re*(x14p) + W2.re*(x23p) - j*(W1.im*(x14n) + W2.im*(x23n))
415        //
416        // The final step is to write out real and imaginary parts of x14n etc, and replacing using j*j=-1
417        // After this it's easy to remove any repeated calculation of the same values.
418
419        let x14p = buffer.load(1) + buffer.load(4);
420        let x14n = buffer.load(1) - buffer.load(4);
421        let x23p = buffer.load(2) + buffer.load(3);
422        let x23n = buffer.load(2) - buffer.load(3);
423        let sum = buffer.load(0) + x14p + x23p;
424        let b14re_a = buffer.load(0).re + self.twiddle1.re * x14p.re + self.twiddle2.re * x23p.re;
425        let b14re_b = self.twiddle1.im * x14n.im + self.twiddle2.im * x23n.im;
426        let b23re_a = buffer.load(0).re + self.twiddle2.re * x14p.re + self.twiddle1.re * x23p.re;
427        let b23re_b = self.twiddle2.im * x14n.im + -self.twiddle1.im * x23n.im;
428
429        let b14im_a = buffer.load(0).im + self.twiddle1.re * x14p.im + self.twiddle2.re * x23p.im;
430        let b14im_b = self.twiddle1.im * x14n.re + self.twiddle2.im * x23n.re;
431        let b23im_a = buffer.load(0).im + self.twiddle2.re * x14p.im + self.twiddle1.re * x23p.im;
432        let b23im_b = self.twiddle2.im * x14n.re + -self.twiddle1.im * x23n.re;
433
434        let out1re = b14re_a - b14re_b;
435        let out1im = b14im_a + b14im_b;
436        let out2re = b23re_a - b23re_b;
437        let out2im = b23im_a + b23im_b;
438        let out3re = b23re_a + b23re_b;
439        let out3im = b23im_a - b23im_b;
440        let out4re = b14re_a + b14re_b;
441        let out4im = b14im_a - b14im_b;
442        buffer.store(sum, 0);
443        buffer.store(
444            Complex {
445                re: out1re,
446                im: out1im,
447            },
448            1,
449        );
450        buffer.store(
451            Complex {
452                re: out2re,
453                im: out2im,
454            },
455            2,
456        );
457        buffer.store(
458            Complex {
459                re: out3re,
460                im: out3im,
461            },
462            3,
463        );
464        buffer.store(
465            Complex {
466                re: out4re,
467                im: out4im,
468            },
469            4,
470        );
471    }
472}
473
474pub struct Butterfly6<T> {
475    butterfly3: Butterfly3<T>,
476}
477boilerplate_fft_butterfly!(Butterfly6, 6, |this: &Butterfly6<_>| this
478    .butterfly3
479    .fft_direction());
480impl<T: FftNum> Butterfly6<T> {
481    #[inline(always)]
482    pub fn new(direction: FftDirection) -> Self {
483        Self {
484            butterfly3: Butterfly3::new(direction),
485        }
486    }
487    #[inline(always)]
488    pub fn direction_of(fft: &Butterfly6<T>) -> Self {
489        Self {
490            butterfly3: Butterfly3::direction_of(&fft.butterfly3),
491        }
492    }
493    #[inline(always)]
494    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
495        //since GCD(2,3) == 1 we're going to hardcode a step of the Good-Thomas algorithm to avoid twiddle factors
496
497        // step 1: reorder the input directly into the scratch. normally there's a whole thing to compute this ordering
498        //but thankfully we can just precompute it and hardcode it
499        let mut scratch_a = [buffer.load(0), buffer.load(2), buffer.load(4)];
500
501        let mut scratch_b = [buffer.load(3), buffer.load(5), buffer.load(1)];
502
503        // step 2: column FFTs
504        self.butterfly3.perform_fft_contiguous(&mut scratch_a);
505        self.butterfly3.perform_fft_contiguous(&mut scratch_b);
506
507        // step 3: apply twiddle factors -- SKIPPED because good-thomas doesn't have twiddle factors :)
508
509        // step 4: SKIPPED because the next FFTs will be non-contiguous
510
511        // step 5: row FFTs
512        Butterfly2::perform_fft_strided(&mut scratch_a[0], &mut scratch_b[0]);
513        Butterfly2::perform_fft_strided(&mut scratch_a[1], &mut scratch_b[1]);
514        Butterfly2::perform_fft_strided(&mut scratch_a[2], &mut scratch_b[2]);
515
516        // step 6: reorder the result back into the buffer. again we would normally have to do an expensive computation
517        // but instead we can precompute and hardcode the ordering
518        // note that we're also rolling a transpose step into this reorder
519        buffer.store(scratch_a[0], 0);
520        buffer.store(scratch_b[1], 1);
521        buffer.store(scratch_a[2], 2);
522        buffer.store(scratch_b[0], 3);
523        buffer.store(scratch_a[1], 4);
524        buffer.store(scratch_b[2], 5);
525    }
526}
527
528pub struct Butterfly7<T> {
529    twiddle1: Complex<T>,
530    twiddle2: Complex<T>,
531    twiddle3: Complex<T>,
532    direction: FftDirection,
533}
534boilerplate_fft_butterfly!(Butterfly7, 7, |this: &Butterfly7<_>| this.direction);
535impl<T: FftNum> Butterfly7<T> {
536    pub fn new(direction: FftDirection) -> Self {
537        Self {
538            twiddle1: twiddles::compute_twiddle(1, 7, direction),
539            twiddle2: twiddles::compute_twiddle(2, 7, direction),
540            twiddle3: twiddles::compute_twiddle(3, 7, direction),
541            direction,
542        }
543    }
544    #[inline(never)]
545    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
546        // let mut outer = Butterfly2::perform_fft_array([buffer.load(1), buffer.load(6)]);
547        // let mut mid   = Butterfly2::perform_fft_array([buffer.load(2), buffer.load(5)]);
548        // let mut inner = Butterfly2::perform_fft_array([buffer.load(3), buffer.load(4)]);
549        // let input0 = buffer.load(0);
550
551        // buffer.store(input0 + outer[0] + mid[0] + inner[0], 0);
552
553        // inner[1] = twiddles::rotate_90(inner[1], true);
554        // mid[1]   = twiddles::rotate_90(mid[1],   true);
555        // outer[1] = twiddles::rotate_90(outer[1], true);
556
557        // {
558        //     let twiddled1 = outer[0] * self.twiddles[0].re;
559        //     let twiddled2 =   mid[0] * self.twiddles[1].re;
560        //     let twiddled3 = inner[0] * self.twiddles[2].re;
561        //     let twiddled4 = inner[1] * self.twiddles[2].im;
562        //     let twiddled5 =   mid[1] * self.twiddles[1].im;
563        //     let twiddled6 = outer[1] * self.twiddles[0].im;
564
565        //     let sum123 = twiddled1 + twiddled2 + twiddled3;
566        //     let sum456 = twiddled4 + twiddled5 + twiddled6;
567
568        //     let output1 = sum123 + sum456;
569        //     let output6 = sum123 - sum456;
570
571        //     buffer.store(input0 + output1, 1);
572        //     buffer.store(input0 + output6, 6);
573        // }
574
575        // {
576        //     let twiddled1 = outer[0] * self.twiddles[1].re;
577        //     let twiddled2 =   mid[0] * self.twiddles[2].re;
578        //     let twiddled3 = inner[0] * self.twiddles[0].re;
579        //     let twiddled4 = inner[1] * self.twiddles[0].im;
580        //     let twiddled5 =   mid[1] * self.twiddles[2].im;
581        //     let twiddled6 = outer[1] * self.twiddles[1].im;
582
583        //     let sum123 = twiddled1 + twiddled2 + twiddled3;
584        //     let sum456 = twiddled6 - twiddled4 - twiddled5;
585
586        //     let output2 = sum123 + sum456;
587        //     let output5 = sum123 - sum456;
588
589        //     buffer.store(input0 + output2, 2);
590        //     buffer.store(input0 + output5, 5);
591        // }
592
593        // Let's do a plain 7-point Dft
594        // |X0|   | W0 W0  W0  W0  W0  W0  W0  |   |x0|
595        // |X1|   | W0 W1  W2  W3  W4  W5  W6  |   |x1|
596        // |X2|   | W0 W2  W4  W6  W8  W10 W12 |   |x2|
597        // |X3| = | W0 W3  W6  W9  W12 W15 W18 | * |x3|
598        // |X4|   | W0 W4  W8  W12 W16 W20 W24 |   |x4|
599        // |X5|   | W0 W5  W10 W15 W20 W25 W30 |   |x4|
600        // |X6|   | W0 W6  W12 W18 W24 W30 W36 |   |x4|
601        //
602        // where Wn = exp(-2*pi*n/7) for a forward transform, and exp(+2*pi*n/7) for an direction.
603        //
604        // Using the same logic as for the 5-point butterfly, this can be simplified to:
605        // |X0|   | 1  1   1   1   1   1   1   |   |x0|
606        // |X1|   | 1  W1  W2  W3  W3* W2* W1* |   |x1|
607        // |X2|   | 1  W2  W3* W1* W1  W3  W2* |   |x2|
608        // |X3| = | 1  W3  W1* W2  W2* W1  W3* | * |x3|
609        // |X4|   | 1  W3* W1  W2* W2  W1* W3  |   |x4|
610        // |X5|   | 1  W2* W3  W1  W1* W3* W2  |   |x5|
611        // |X6|   | 1  W1* W2* W3* W3  W2  W1  |   |x6|
612        //
613        // From here it's just about eliminating repeated calculations, following the same procedure as for the 5-point butterfly.
614
615        let x16p = buffer.load(1) + buffer.load(6);
616        let x16n = buffer.load(1) - buffer.load(6);
617        let x25p = buffer.load(2) + buffer.load(5);
618        let x25n = buffer.load(2) - buffer.load(5);
619        let x34p = buffer.load(3) + buffer.load(4);
620        let x34n = buffer.load(3) - buffer.load(4);
621        let sum = buffer.load(0) + x16p + x25p + x34p;
622
623        let x16re_a = buffer.load(0).re
624            + self.twiddle1.re * x16p.re
625            + self.twiddle2.re * x25p.re
626            + self.twiddle3.re * x34p.re;
627        let x16re_b =
628            self.twiddle1.im * x16n.im + self.twiddle2.im * x25n.im + self.twiddle3.im * x34n.im;
629        let x25re_a = buffer.load(0).re
630            + self.twiddle1.re * x34p.re
631            + self.twiddle2.re * x16p.re
632            + self.twiddle3.re * x25p.re;
633        let x25re_b =
634            -self.twiddle1.im * x34n.im + self.twiddle2.im * x16n.im - self.twiddle3.im * x25n.im;
635        let x34re_a = buffer.load(0).re
636            + self.twiddle1.re * x25p.re
637            + self.twiddle2.re * x34p.re
638            + self.twiddle3.re * x16p.re;
639        let x34re_b =
640            -self.twiddle1.im * x25n.im + self.twiddle2.im * x34n.im + self.twiddle3.im * x16n.im;
641        let x16im_a = buffer.load(0).im
642            + self.twiddle1.re * x16p.im
643            + self.twiddle2.re * x25p.im
644            + self.twiddle3.re * x34p.im;
645        let x16im_b =
646            self.twiddle1.im * x16n.re + self.twiddle2.im * x25n.re + self.twiddle3.im * x34n.re;
647        let x25im_a = buffer.load(0).im
648            + self.twiddle1.re * x34p.im
649            + self.twiddle2.re * x16p.im
650            + self.twiddle3.re * x25p.im;
651        let x25im_b =
652            -self.twiddle1.im * x34n.re + self.twiddle2.im * x16n.re - self.twiddle3.im * x25n.re;
653        let x34im_a = buffer.load(0).im
654            + self.twiddle1.re * x25p.im
655            + self.twiddle2.re * x34p.im
656            + self.twiddle3.re * x16p.im;
657        let x34im_b =
658            self.twiddle1.im * x25n.re - self.twiddle2.im * x34n.re - self.twiddle3.im * x16n.re;
659
660        let out1re = x16re_a - x16re_b;
661        let out1im = x16im_a + x16im_b;
662        let out2re = x25re_a - x25re_b;
663        let out2im = x25im_a + x25im_b;
664        let out3re = x34re_a - x34re_b;
665        let out3im = x34im_a - x34im_b;
666        let out4re = x34re_a + x34re_b;
667        let out4im = x34im_a + x34im_b;
668        let out5re = x25re_a + x25re_b;
669        let out5im = x25im_a - x25im_b;
670        let out6re = x16re_a + x16re_b;
671        let out6im = x16im_a - x16im_b;
672
673        buffer.store(sum, 0);
674        buffer.store(
675            Complex {
676                re: out1re,
677                im: out1im,
678            },
679            1,
680        );
681        buffer.store(
682            Complex {
683                re: out2re,
684                im: out2im,
685            },
686            2,
687        );
688        buffer.store(
689            Complex {
690                re: out3re,
691                im: out3im,
692            },
693            3,
694        );
695        buffer.store(
696            Complex {
697                re: out4re,
698                im: out4im,
699            },
700            4,
701        );
702        buffer.store(
703            Complex {
704                re: out5re,
705                im: out5im,
706            },
707            5,
708        );
709        buffer.store(
710            Complex {
711                re: out6re,
712                im: out6im,
713            },
714            6,
715        );
716    }
717}
718
719pub struct Butterfly8<T> {
720    root2: T,
721    direction: FftDirection,
722}
723boilerplate_fft_butterfly!(Butterfly8, 8, |this: &Butterfly8<_>| this.direction);
724impl<T: FftNum> Butterfly8<T> {
725    #[inline(always)]
726    pub fn new(direction: FftDirection) -> Self {
727        Self {
728            root2: T::from_f64(0.5f64.sqrt()).unwrap(),
729            direction,
730        }
731    }
732
733    #[inline(always)]
734    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
735        let butterfly4 = Butterfly4::new(self.direction);
736
737        //we're going to hardcode a step of mixed radix
738        //aka we're going to do the six step algorithm
739
740        // step 1: transpose the input into the scratch
741        let mut scratch0 = [
742            buffer.load(0),
743            buffer.load(2),
744            buffer.load(4),
745            buffer.load(6),
746        ];
747        let mut scratch1 = [
748            buffer.load(1),
749            buffer.load(3),
750            buffer.load(5),
751            buffer.load(7),
752        ];
753
754        // step 2: column FFTs
755        butterfly4.perform_fft_contiguous(&mut scratch0);
756        butterfly4.perform_fft_contiguous(&mut scratch1);
757
758        // step 3: apply twiddle factors
759        scratch1[1] = (twiddles::rotate_90(scratch1[1], self.direction) + scratch1[1]) * self.root2;
760        scratch1[2] = twiddles::rotate_90(scratch1[2], self.direction);
761        scratch1[3] = (twiddles::rotate_90(scratch1[3], self.direction) - scratch1[3]) * self.root2;
762
763        // step 4: transpose -- skipped because we're going to do the next FFTs non-contiguously
764
765        // step 5: row FFTs
766        for i in 0..4 {
767            Butterfly2::perform_fft_strided(&mut scratch0[i], &mut scratch1[i]);
768        }
769
770        // step 6: copy data to the output. we don't need to transpose, because we skipped the step 4 transpose
771        for i in 0..4 {
772            buffer.store(scratch0[i], i);
773        }
774        for i in 0..4 {
775            buffer.store(scratch1[i], i + 4);
776        }
777    }
778}
779
780pub struct Butterfly9<T> {
781    butterfly3: Butterfly3<T>,
782    twiddle1: Complex<T>,
783    twiddle2: Complex<T>,
784    twiddle4: Complex<T>,
785}
786boilerplate_fft_butterfly!(Butterfly9, 9, |this: &Butterfly9<_>| this
787    .butterfly3
788    .fft_direction());
789impl<T: FftNum> Butterfly9<T> {
790    #[inline(always)]
791    pub fn new(direction: FftDirection) -> Self {
792        Self {
793            butterfly3: Butterfly3::new(direction),
794            twiddle1: twiddles::compute_twiddle(1, 9, direction),
795            twiddle2: twiddles::compute_twiddle(2, 9, direction),
796            twiddle4: twiddles::compute_twiddle(4, 9, direction),
797        }
798    }
799    #[inline(always)]
800    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
801        // algorithm: mixed radix with width=3 and height=3
802
803        // step 1: transpose the input into the scratch
804        let mut scratch0 = [buffer.load(0), buffer.load(3), buffer.load(6)];
805        let mut scratch1 = [buffer.load(1), buffer.load(4), buffer.load(7)];
806        let mut scratch2 = [buffer.load(2), buffer.load(5), buffer.load(8)];
807
808        // step 2: column FFTs
809        self.butterfly3.perform_fft_contiguous(&mut scratch0);
810        self.butterfly3.perform_fft_contiguous(&mut scratch1);
811        self.butterfly3.perform_fft_contiguous(&mut scratch2);
812
813        // step 3: apply twiddle factors
814        scratch1[1] = scratch1[1] * self.twiddle1;
815        scratch1[2] = scratch1[2] * self.twiddle2;
816        scratch2[1] = scratch2[1] * self.twiddle2;
817        scratch2[2] = scratch2[2] * self.twiddle4;
818
819        // step 4: SKIPPED because the next FFTs will be non-contiguous
820
821        // step 5: row FFTs
822        self.butterfly3
823            .perform_fft_strided(&mut scratch0[0], &mut scratch1[0], &mut scratch2[0]);
824        self.butterfly3
825            .perform_fft_strided(&mut scratch0[1], &mut scratch1[1], &mut scratch2[1]);
826        self.butterfly3
827            .perform_fft_strided(&mut scratch0[2], &mut scratch1[2], &mut scratch2[2]);
828
829        // step 6: copy the result into the output. normally we'd need to do a transpose here, but we can skip it because we skipped the transpose in step 4
830        buffer.store(scratch0[0], 0);
831        buffer.store(scratch0[1], 1);
832        buffer.store(scratch0[2], 2);
833        buffer.store(scratch1[0], 3);
834        buffer.store(scratch1[1], 4);
835        buffer.store(scratch1[2], 5);
836        buffer.store(scratch2[0], 6);
837        buffer.store(scratch2[1], 7);
838        buffer.store(scratch2[2], 8);
839    }
840}
841
842pub struct Butterfly11<T> {
843    twiddle1: Complex<T>,
844    twiddle2: Complex<T>,
845    twiddle3: Complex<T>,
846    twiddle4: Complex<T>,
847    twiddle5: Complex<T>,
848    direction: FftDirection,
849}
850boilerplate_fft_butterfly!(Butterfly11, 11, |this: &Butterfly11<_>| this.direction);
851impl<T: FftNum> Butterfly11<T> {
852    pub fn new(direction: FftDirection) -> Self {
853        let twiddle1: Complex<T> = twiddles::compute_twiddle(1, 11, direction);
854        let twiddle2: Complex<T> = twiddles::compute_twiddle(2, 11, direction);
855        let twiddle3: Complex<T> = twiddles::compute_twiddle(3, 11, direction);
856        let twiddle4: Complex<T> = twiddles::compute_twiddle(4, 11, direction);
857        let twiddle5: Complex<T> = twiddles::compute_twiddle(5, 11, direction);
858        Self {
859            twiddle1,
860            twiddle2,
861            twiddle3,
862            twiddle4,
863            twiddle5,
864            direction,
865        }
866    }
867
868    #[inline(never)]
869    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
870        // This function was derived in the same manner as the butterflies for length 3, 5 and 7.
871        // However, instead of doing it by hand the actual code is autogenerated
872        // with the `genbutterflies.py` script in the `tools` directory.
873
874        let x110p = buffer.load(1) + buffer.load(10);
875        let x110n = buffer.load(1) - buffer.load(10);
876        let x29p = buffer.load(2) + buffer.load(9);
877        let x29n = buffer.load(2) - buffer.load(9);
878        let x38p = buffer.load(3) + buffer.load(8);
879        let x38n = buffer.load(3) - buffer.load(8);
880        let x47p = buffer.load(4) + buffer.load(7);
881        let x47n = buffer.load(4) - buffer.load(7);
882        let x56p = buffer.load(5) + buffer.load(6);
883        let x56n = buffer.load(5) - buffer.load(6);
884        let sum = buffer.load(0) + x110p + x29p + x38p + x47p + x56p;
885        let b110re_a = buffer.load(0).re
886            + self.twiddle1.re * x110p.re
887            + self.twiddle2.re * x29p.re
888            + self.twiddle3.re * x38p.re
889            + self.twiddle4.re * x47p.re
890            + self.twiddle5.re * x56p.re;
891        let b110re_b = self.twiddle1.im * x110n.im
892            + self.twiddle2.im * x29n.im
893            + self.twiddle3.im * x38n.im
894            + self.twiddle4.im * x47n.im
895            + self.twiddle5.im * x56n.im;
896        let b29re_a = buffer.load(0).re
897            + self.twiddle2.re * x110p.re
898            + self.twiddle4.re * x29p.re
899            + self.twiddle5.re * x38p.re
900            + self.twiddle3.re * x47p.re
901            + self.twiddle1.re * x56p.re;
902        let b29re_b = self.twiddle2.im * x110n.im
903            + self.twiddle4.im * x29n.im
904            + -self.twiddle5.im * x38n.im
905            + -self.twiddle3.im * x47n.im
906            + -self.twiddle1.im * x56n.im;
907        let b38re_a = buffer.load(0).re
908            + self.twiddle3.re * x110p.re
909            + self.twiddle5.re * x29p.re
910            + self.twiddle2.re * x38p.re
911            + self.twiddle1.re * x47p.re
912            + self.twiddle4.re * x56p.re;
913        let b38re_b = self.twiddle3.im * x110n.im
914            + -self.twiddle5.im * x29n.im
915            + -self.twiddle2.im * x38n.im
916            + self.twiddle1.im * x47n.im
917            + self.twiddle4.im * x56n.im;
918        let b47re_a = buffer.load(0).re
919            + self.twiddle4.re * x110p.re
920            + self.twiddle3.re * x29p.re
921            + self.twiddle1.re * x38p.re
922            + self.twiddle5.re * x47p.re
923            + self.twiddle2.re * x56p.re;
924        let b47re_b = self.twiddle4.im * x110n.im
925            + -self.twiddle3.im * x29n.im
926            + self.twiddle1.im * x38n.im
927            + self.twiddle5.im * x47n.im
928            + -self.twiddle2.im * x56n.im;
929        let b56re_a = buffer.load(0).re
930            + self.twiddle5.re * x110p.re
931            + self.twiddle1.re * x29p.re
932            + self.twiddle4.re * x38p.re
933            + self.twiddle2.re * x47p.re
934            + self.twiddle3.re * x56p.re;
935        let b56re_b = self.twiddle5.im * x110n.im
936            + -self.twiddle1.im * x29n.im
937            + self.twiddle4.im * x38n.im
938            + -self.twiddle2.im * x47n.im
939            + self.twiddle3.im * x56n.im;
940
941        let b110im_a = buffer.load(0).im
942            + self.twiddle1.re * x110p.im
943            + self.twiddle2.re * x29p.im
944            + self.twiddle3.re * x38p.im
945            + self.twiddle4.re * x47p.im
946            + self.twiddle5.re * x56p.im;
947        let b110im_b = self.twiddle1.im * x110n.re
948            + self.twiddle2.im * x29n.re
949            + self.twiddle3.im * x38n.re
950            + self.twiddle4.im * x47n.re
951            + self.twiddle5.im * x56n.re;
952        let b29im_a = buffer.load(0).im
953            + self.twiddle2.re * x110p.im
954            + self.twiddle4.re * x29p.im
955            + self.twiddle5.re * x38p.im
956            + self.twiddle3.re * x47p.im
957            + self.twiddle1.re * x56p.im;
958        let b29im_b = self.twiddle2.im * x110n.re
959            + self.twiddle4.im * x29n.re
960            + -self.twiddle5.im * x38n.re
961            + -self.twiddle3.im * x47n.re
962            + -self.twiddle1.im * x56n.re;
963        let b38im_a = buffer.load(0).im
964            + self.twiddle3.re * x110p.im
965            + self.twiddle5.re * x29p.im
966            + self.twiddle2.re * x38p.im
967            + self.twiddle1.re * x47p.im
968            + self.twiddle4.re * x56p.im;
969        let b38im_b = self.twiddle3.im * x110n.re
970            + -self.twiddle5.im * x29n.re
971            + -self.twiddle2.im * x38n.re
972            + self.twiddle1.im * x47n.re
973            + self.twiddle4.im * x56n.re;
974        let b47im_a = buffer.load(0).im
975            + self.twiddle4.re * x110p.im
976            + self.twiddle3.re * x29p.im
977            + self.twiddle1.re * x38p.im
978            + self.twiddle5.re * x47p.im
979            + self.twiddle2.re * x56p.im;
980        let b47im_b = self.twiddle4.im * x110n.re
981            + -self.twiddle3.im * x29n.re
982            + self.twiddle1.im * x38n.re
983            + self.twiddle5.im * x47n.re
984            + -self.twiddle2.im * x56n.re;
985        let b56im_a = buffer.load(0).im
986            + self.twiddle5.re * x110p.im
987            + self.twiddle1.re * x29p.im
988            + self.twiddle4.re * x38p.im
989            + self.twiddle2.re * x47p.im
990            + self.twiddle3.re * x56p.im;
991        let b56im_b = self.twiddle5.im * x110n.re
992            + -self.twiddle1.im * x29n.re
993            + self.twiddle4.im * x38n.re
994            + -self.twiddle2.im * x47n.re
995            + self.twiddle3.im * x56n.re;
996
997        let out1re = b110re_a - b110re_b;
998        let out1im = b110im_a + b110im_b;
999        let out2re = b29re_a - b29re_b;
1000        let out2im = b29im_a + b29im_b;
1001        let out3re = b38re_a - b38re_b;
1002        let out3im = b38im_a + b38im_b;
1003        let out4re = b47re_a - b47re_b;
1004        let out4im = b47im_a + b47im_b;
1005        let out5re = b56re_a - b56re_b;
1006        let out5im = b56im_a + b56im_b;
1007        let out6re = b56re_a + b56re_b;
1008        let out6im = b56im_a - b56im_b;
1009        let out7re = b47re_a + b47re_b;
1010        let out7im = b47im_a - b47im_b;
1011        let out8re = b38re_a + b38re_b;
1012        let out8im = b38im_a - b38im_b;
1013        let out9re = b29re_a + b29re_b;
1014        let out9im = b29im_a - b29im_b;
1015        let out10re = b110re_a + b110re_b;
1016        let out10im = b110im_a - b110im_b;
1017        buffer.store(sum, 0);
1018        buffer.store(
1019            Complex {
1020                re: out1re,
1021                im: out1im,
1022            },
1023            1,
1024        );
1025        buffer.store(
1026            Complex {
1027                re: out2re,
1028                im: out2im,
1029            },
1030            2,
1031        );
1032        buffer.store(
1033            Complex {
1034                re: out3re,
1035                im: out3im,
1036            },
1037            3,
1038        );
1039        buffer.store(
1040            Complex {
1041                re: out4re,
1042                im: out4im,
1043            },
1044            4,
1045        );
1046        buffer.store(
1047            Complex {
1048                re: out5re,
1049                im: out5im,
1050            },
1051            5,
1052        );
1053        buffer.store(
1054            Complex {
1055                re: out6re,
1056                im: out6im,
1057            },
1058            6,
1059        );
1060        buffer.store(
1061            Complex {
1062                re: out7re,
1063                im: out7im,
1064            },
1065            7,
1066        );
1067        buffer.store(
1068            Complex {
1069                re: out8re,
1070                im: out8im,
1071            },
1072            8,
1073        );
1074        buffer.store(
1075            Complex {
1076                re: out9re,
1077                im: out9im,
1078            },
1079            9,
1080        );
1081        buffer.store(
1082            Complex {
1083                re: out10re,
1084                im: out10im,
1085            },
1086            10,
1087        );
1088    }
1089}
1090
1091pub struct Butterfly12<T> {
1092    butterfly3: Butterfly3<T>,
1093    butterfly4: Butterfly4<T>,
1094}
1095boilerplate_fft_butterfly!(Butterfly12, 12, |this: &Butterfly12<_>| this
1096    .butterfly3
1097    .fft_direction());
1098impl<T: FftNum> Butterfly12<T> {
1099    #[inline(always)]
1100    pub fn new(direction: FftDirection) -> Self {
1101        Self {
1102            butterfly3: Butterfly3::new(direction),
1103            butterfly4: Butterfly4::new(direction),
1104        }
1105    }
1106    #[inline(always)]
1107    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
1108        //since GCD(4,3) == 1 we're going to hardcode a step of the Good-Thomas algorithm to avoid twiddle factors
1109
1110        // step 1: reorder the input directly into the scratch. normally there's a whole thing to compute this ordering
1111        //but thankfully we can just precompute it and hardcode it
1112        let mut scratch0 = [
1113            buffer.load(0),
1114            buffer.load(3),
1115            buffer.load(6),
1116            buffer.load(9),
1117        ];
1118        let mut scratch1 = [
1119            buffer.load(4),
1120            buffer.load(7),
1121            buffer.load(10),
1122            buffer.load(1),
1123        ];
1124        let mut scratch2 = [
1125            buffer.load(8),
1126            buffer.load(11),
1127            buffer.load(2),
1128            buffer.load(5),
1129        ];
1130
1131        // step 2: column FFTs
1132        self.butterfly4.perform_fft_contiguous(&mut scratch0);
1133        self.butterfly4.perform_fft_contiguous(&mut scratch1);
1134        self.butterfly4.perform_fft_contiguous(&mut scratch2);
1135
1136        // step 3: apply twiddle factors -- SKIPPED because good-thomas doesn't have twiddle factors :)
1137
1138        // step 4: SKIPPED because the next FFTs will be non-contiguous
1139
1140        // step 5: row FFTs
1141        self.butterfly3
1142            .perform_fft_strided(&mut scratch0[0], &mut scratch1[0], &mut scratch2[0]);
1143        self.butterfly3
1144            .perform_fft_strided(&mut scratch0[1], &mut scratch1[1], &mut scratch2[1]);
1145        self.butterfly3
1146            .perform_fft_strided(&mut scratch0[2], &mut scratch1[2], &mut scratch2[2]);
1147        self.butterfly3
1148            .perform_fft_strided(&mut scratch0[3], &mut scratch1[3], &mut scratch2[3]);
1149
1150        // step 6: reorder the result back into the buffer. again we would normally have to do an expensive computation
1151        // but instead we can precompute and hardcode the ordering
1152        // note that we're also rolling a transpose step into this reorder
1153        buffer.store(scratch0[0], 0);
1154        buffer.store(scratch1[1], 1);
1155        buffer.store(scratch2[2], 2);
1156        buffer.store(scratch0[3], 3);
1157        buffer.store(scratch1[0], 4);
1158        buffer.store(scratch2[1], 5);
1159        buffer.store(scratch0[2], 6);
1160        buffer.store(scratch1[3], 7);
1161        buffer.store(scratch2[0], 8);
1162        buffer.store(scratch0[1], 9);
1163        buffer.store(scratch1[2], 10);
1164        buffer.store(scratch2[3], 11);
1165    }
1166}
1167
1168pub struct Butterfly13<T> {
1169    twiddle1: Complex<T>,
1170    twiddle2: Complex<T>,
1171    twiddle3: Complex<T>,
1172    twiddle4: Complex<T>,
1173    twiddle5: Complex<T>,
1174    twiddle6: Complex<T>,
1175    direction: FftDirection,
1176}
1177boilerplate_fft_butterfly!(Butterfly13, 13, |this: &Butterfly13<_>| this.direction);
1178impl<T: FftNum> Butterfly13<T> {
1179    pub fn new(direction: FftDirection) -> Self {
1180        let twiddle1: Complex<T> = twiddles::compute_twiddle(1, 13, direction);
1181        let twiddle2: Complex<T> = twiddles::compute_twiddle(2, 13, direction);
1182        let twiddle3: Complex<T> = twiddles::compute_twiddle(3, 13, direction);
1183        let twiddle4: Complex<T> = twiddles::compute_twiddle(4, 13, direction);
1184        let twiddle5: Complex<T> = twiddles::compute_twiddle(5, 13, direction);
1185        let twiddle6: Complex<T> = twiddles::compute_twiddle(6, 13, direction);
1186        Self {
1187            twiddle1,
1188            twiddle2,
1189            twiddle3,
1190            twiddle4,
1191            twiddle5,
1192            twiddle6,
1193            direction,
1194        }
1195    }
1196
1197    #[inline(never)]
1198    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
1199        // This function was derived in the same manner as the butterflies for length 3, 5 and 7.
1200        // However, instead of doing it by hand the actual code is autogenerated
1201        // with the `genbutterflies.py` script in the `tools` directory.
1202        let x112p = buffer.load(1) + buffer.load(12);
1203        let x112n = buffer.load(1) - buffer.load(12);
1204        let x211p = buffer.load(2) + buffer.load(11);
1205        let x211n = buffer.load(2) - buffer.load(11);
1206        let x310p = buffer.load(3) + buffer.load(10);
1207        let x310n = buffer.load(3) - buffer.load(10);
1208        let x49p = buffer.load(4) + buffer.load(9);
1209        let x49n = buffer.load(4) - buffer.load(9);
1210        let x58p = buffer.load(5) + buffer.load(8);
1211        let x58n = buffer.load(5) - buffer.load(8);
1212        let x67p = buffer.load(6) + buffer.load(7);
1213        let x67n = buffer.load(6) - buffer.load(7);
1214        let sum = buffer.load(0) + x112p + x211p + x310p + x49p + x58p + x67p;
1215        let b112re_a = buffer.load(0).re
1216            + self.twiddle1.re * x112p.re
1217            + self.twiddle2.re * x211p.re
1218            + self.twiddle3.re * x310p.re
1219            + self.twiddle4.re * x49p.re
1220            + self.twiddle5.re * x58p.re
1221            + self.twiddle6.re * x67p.re;
1222        let b112re_b = self.twiddle1.im * x112n.im
1223            + self.twiddle2.im * x211n.im
1224            + self.twiddle3.im * x310n.im
1225            + self.twiddle4.im * x49n.im
1226            + self.twiddle5.im * x58n.im
1227            + self.twiddle6.im * x67n.im;
1228        let b211re_a = buffer.load(0).re
1229            + self.twiddle2.re * x112p.re
1230            + self.twiddle4.re * x211p.re
1231            + self.twiddle6.re * x310p.re
1232            + self.twiddle5.re * x49p.re
1233            + self.twiddle3.re * x58p.re
1234            + self.twiddle1.re * x67p.re;
1235        let b211re_b = self.twiddle2.im * x112n.im
1236            + self.twiddle4.im * x211n.im
1237            + self.twiddle6.im * x310n.im
1238            + -self.twiddle5.im * x49n.im
1239            + -self.twiddle3.im * x58n.im
1240            + -self.twiddle1.im * x67n.im;
1241        let b310re_a = buffer.load(0).re
1242            + self.twiddle3.re * x112p.re
1243            + self.twiddle6.re * x211p.re
1244            + self.twiddle4.re * x310p.re
1245            + self.twiddle1.re * x49p.re
1246            + self.twiddle2.re * x58p.re
1247            + self.twiddle5.re * x67p.re;
1248        let b310re_b = self.twiddle3.im * x112n.im
1249            + self.twiddle6.im * x211n.im
1250            + -self.twiddle4.im * x310n.im
1251            + -self.twiddle1.im * x49n.im
1252            + self.twiddle2.im * x58n.im
1253            + self.twiddle5.im * x67n.im;
1254        let b49re_a = buffer.load(0).re
1255            + self.twiddle4.re * x112p.re
1256            + self.twiddle5.re * x211p.re
1257            + self.twiddle1.re * x310p.re
1258            + self.twiddle3.re * x49p.re
1259            + self.twiddle6.re * x58p.re
1260            + self.twiddle2.re * x67p.re;
1261        let b49re_b = self.twiddle4.im * x112n.im
1262            + -self.twiddle5.im * x211n.im
1263            + -self.twiddle1.im * x310n.im
1264            + self.twiddle3.im * x49n.im
1265            + -self.twiddle6.im * x58n.im
1266            + -self.twiddle2.im * x67n.im;
1267        let b58re_a = buffer.load(0).re
1268            + self.twiddle5.re * x112p.re
1269            + self.twiddle3.re * x211p.re
1270            + self.twiddle2.re * x310p.re
1271            + self.twiddle6.re * x49p.re
1272            + self.twiddle1.re * x58p.re
1273            + self.twiddle4.re * x67p.re;
1274        let b58re_b = self.twiddle5.im * x112n.im
1275            + -self.twiddle3.im * x211n.im
1276            + self.twiddle2.im * x310n.im
1277            + -self.twiddle6.im * x49n.im
1278            + -self.twiddle1.im * x58n.im
1279            + self.twiddle4.im * x67n.im;
1280        let b67re_a = buffer.load(0).re
1281            + self.twiddle6.re * x112p.re
1282            + self.twiddle1.re * x211p.re
1283            + self.twiddle5.re * x310p.re
1284            + self.twiddle2.re * x49p.re
1285            + self.twiddle4.re * x58p.re
1286            + self.twiddle3.re * x67p.re;
1287        let b67re_b = self.twiddle6.im * x112n.im
1288            + -self.twiddle1.im * x211n.im
1289            + self.twiddle5.im * x310n.im
1290            + -self.twiddle2.im * x49n.im
1291            + self.twiddle4.im * x58n.im
1292            + -self.twiddle3.im * x67n.im;
1293
1294        let b112im_a = buffer.load(0).im
1295            + self.twiddle1.re * x112p.im
1296            + self.twiddle2.re * x211p.im
1297            + self.twiddle3.re * x310p.im
1298            + self.twiddle4.re * x49p.im
1299            + self.twiddle5.re * x58p.im
1300            + self.twiddle6.re * x67p.im;
1301        let b112im_b = self.twiddle1.im * x112n.re
1302            + self.twiddle2.im * x211n.re
1303            + self.twiddle3.im * x310n.re
1304            + self.twiddle4.im * x49n.re
1305            + self.twiddle5.im * x58n.re
1306            + self.twiddle6.im * x67n.re;
1307        let b211im_a = buffer.load(0).im
1308            + self.twiddle2.re * x112p.im
1309            + self.twiddle4.re * x211p.im
1310            + self.twiddle6.re * x310p.im
1311            + self.twiddle5.re * x49p.im
1312            + self.twiddle3.re * x58p.im
1313            + self.twiddle1.re * x67p.im;
1314        let b211im_b = self.twiddle2.im * x112n.re
1315            + self.twiddle4.im * x211n.re
1316            + self.twiddle6.im * x310n.re
1317            + -self.twiddle5.im * x49n.re
1318            + -self.twiddle3.im * x58n.re
1319            + -self.twiddle1.im * x67n.re;
1320        let b310im_a = buffer.load(0).im
1321            + self.twiddle3.re * x112p.im
1322            + self.twiddle6.re * x211p.im
1323            + self.twiddle4.re * x310p.im
1324            + self.twiddle1.re * x49p.im
1325            + self.twiddle2.re * x58p.im
1326            + self.twiddle5.re * x67p.im;
1327        let b310im_b = self.twiddle3.im * x112n.re
1328            + self.twiddle6.im * x211n.re
1329            + -self.twiddle4.im * x310n.re
1330            + -self.twiddle1.im * x49n.re
1331            + self.twiddle2.im * x58n.re
1332            + self.twiddle5.im * x67n.re;
1333        let b49im_a = buffer.load(0).im
1334            + self.twiddle4.re * x112p.im
1335            + self.twiddle5.re * x211p.im
1336            + self.twiddle1.re * x310p.im
1337            + self.twiddle3.re * x49p.im
1338            + self.twiddle6.re * x58p.im
1339            + self.twiddle2.re * x67p.im;
1340        let b49im_b = self.twiddle4.im * x112n.re
1341            + -self.twiddle5.im * x211n.re
1342            + -self.twiddle1.im * x310n.re
1343            + self.twiddle3.im * x49n.re
1344            + -self.twiddle6.im * x58n.re
1345            + -self.twiddle2.im * x67n.re;
1346        let b58im_a = buffer.load(0).im
1347            + self.twiddle5.re * x112p.im
1348            + self.twiddle3.re * x211p.im
1349            + self.twiddle2.re * x310p.im
1350            + self.twiddle6.re * x49p.im
1351            + self.twiddle1.re * x58p.im
1352            + self.twiddle4.re * x67p.im;
1353        let b58im_b = self.twiddle5.im * x112n.re
1354            + -self.twiddle3.im * x211n.re
1355            + self.twiddle2.im * x310n.re
1356            + -self.twiddle6.im * x49n.re
1357            + -self.twiddle1.im * x58n.re
1358            + self.twiddle4.im * x67n.re;
1359        let b67im_a = buffer.load(0).im
1360            + self.twiddle6.re * x112p.im
1361            + self.twiddle1.re * x211p.im
1362            + self.twiddle5.re * x310p.im
1363            + self.twiddle2.re * x49p.im
1364            + self.twiddle4.re * x58p.im
1365            + self.twiddle3.re * x67p.im;
1366        let b67im_b = self.twiddle6.im * x112n.re
1367            + -self.twiddle1.im * x211n.re
1368            + self.twiddle5.im * x310n.re
1369            + -self.twiddle2.im * x49n.re
1370            + self.twiddle4.im * x58n.re
1371            + -self.twiddle3.im * x67n.re;
1372
1373        let out1re = b112re_a - b112re_b;
1374        let out1im = b112im_a + b112im_b;
1375        let out2re = b211re_a - b211re_b;
1376        let out2im = b211im_a + b211im_b;
1377        let out3re = b310re_a - b310re_b;
1378        let out3im = b310im_a + b310im_b;
1379        let out4re = b49re_a - b49re_b;
1380        let out4im = b49im_a + b49im_b;
1381        let out5re = b58re_a - b58re_b;
1382        let out5im = b58im_a + b58im_b;
1383        let out6re = b67re_a - b67re_b;
1384        let out6im = b67im_a + b67im_b;
1385        let out7re = b67re_a + b67re_b;
1386        let out7im = b67im_a - b67im_b;
1387        let out8re = b58re_a + b58re_b;
1388        let out8im = b58im_a - b58im_b;
1389        let out9re = b49re_a + b49re_b;
1390        let out9im = b49im_a - b49im_b;
1391        let out10re = b310re_a + b310re_b;
1392        let out10im = b310im_a - b310im_b;
1393        let out11re = b211re_a + b211re_b;
1394        let out11im = b211im_a - b211im_b;
1395        let out12re = b112re_a + b112re_b;
1396        let out12im = b112im_a - b112im_b;
1397        buffer.store(sum, 0);
1398        buffer.store(
1399            Complex {
1400                re: out1re,
1401                im: out1im,
1402            },
1403            1,
1404        );
1405        buffer.store(
1406            Complex {
1407                re: out2re,
1408                im: out2im,
1409            },
1410            2,
1411        );
1412        buffer.store(
1413            Complex {
1414                re: out3re,
1415                im: out3im,
1416            },
1417            3,
1418        );
1419        buffer.store(
1420            Complex {
1421                re: out4re,
1422                im: out4im,
1423            },
1424            4,
1425        );
1426        buffer.store(
1427            Complex {
1428                re: out5re,
1429                im: out5im,
1430            },
1431            5,
1432        );
1433        buffer.store(
1434            Complex {
1435                re: out6re,
1436                im: out6im,
1437            },
1438            6,
1439        );
1440        buffer.store(
1441            Complex {
1442                re: out7re,
1443                im: out7im,
1444            },
1445            7,
1446        );
1447        buffer.store(
1448            Complex {
1449                re: out8re,
1450                im: out8im,
1451            },
1452            8,
1453        );
1454        buffer.store(
1455            Complex {
1456                re: out9re,
1457                im: out9im,
1458            },
1459            9,
1460        );
1461        buffer.store(
1462            Complex {
1463                re: out10re,
1464                im: out10im,
1465            },
1466            10,
1467        );
1468        buffer.store(
1469            Complex {
1470                re: out11re,
1471                im: out11im,
1472            },
1473            11,
1474        );
1475        buffer.store(
1476            Complex {
1477                re: out12re,
1478                im: out12im,
1479            },
1480            12,
1481        );
1482    }
1483}
1484
1485pub struct Butterfly16<T> {
1486    butterfly8: Butterfly8<T>,
1487    twiddle1: Complex<T>,
1488    twiddle2: Complex<T>,
1489    twiddle3: Complex<T>,
1490}
1491boilerplate_fft_butterfly!(Butterfly16, 16, |this: &Butterfly16<_>| this
1492    .butterfly8
1493    .fft_direction());
1494impl<T: FftNum> Butterfly16<T> {
1495    #[inline(always)]
1496    pub fn new(direction: FftDirection) -> Self {
1497        Self {
1498            butterfly8: Butterfly8::new(direction),
1499            twiddle1: twiddles::compute_twiddle(1, 16, direction),
1500            twiddle2: twiddles::compute_twiddle(2, 16, direction),
1501            twiddle3: twiddles::compute_twiddle(3, 16, direction),
1502        }
1503    }
1504
1505    #[inline(never)]
1506    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
1507        let butterfly4 = Butterfly4::new(self.fft_direction());
1508
1509        // we're going to hardcode a step of split radix
1510        // step 1: copy and reorder the  input into the scratch
1511        let mut scratch_evens = [
1512            buffer.load(0),
1513            buffer.load(2),
1514            buffer.load(4),
1515            buffer.load(6),
1516            buffer.load(8),
1517            buffer.load(10),
1518            buffer.load(12),
1519            buffer.load(14),
1520        ];
1521
1522        let mut scratch_odds_n1 = [
1523            buffer.load(1),
1524            buffer.load(5),
1525            buffer.load(9),
1526            buffer.load(13),
1527        ];
1528        let mut scratch_odds_n3 = [
1529            buffer.load(15),
1530            buffer.load(3),
1531            buffer.load(7),
1532            buffer.load(11),
1533        ];
1534
1535        // step 2: column FFTs
1536        self.butterfly8.perform_fft_contiguous(&mut scratch_evens);
1537        butterfly4.perform_fft_contiguous(&mut scratch_odds_n1);
1538        butterfly4.perform_fft_contiguous(&mut scratch_odds_n3);
1539
1540        // step 3: apply twiddle factors
1541        scratch_odds_n1[1] = scratch_odds_n1[1] * self.twiddle1;
1542        scratch_odds_n3[1] = scratch_odds_n3[1] * self.twiddle1.conj();
1543
1544        scratch_odds_n1[2] = scratch_odds_n1[2] * self.twiddle2;
1545        scratch_odds_n3[2] = scratch_odds_n3[2] * self.twiddle2.conj();
1546
1547        scratch_odds_n1[3] = scratch_odds_n1[3] * self.twiddle3;
1548        scratch_odds_n3[3] = scratch_odds_n3[3] * self.twiddle3.conj();
1549
1550        // step 4: cross FFTs
1551        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[0], &mut scratch_odds_n3[0]);
1552        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[1], &mut scratch_odds_n3[1]);
1553        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[2], &mut scratch_odds_n3[2]);
1554        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[3], &mut scratch_odds_n3[3]);
1555
1556        // apply the butterfly 4 twiddle factor, which is just a rotation
1557        scratch_odds_n3[0] = twiddles::rotate_90(scratch_odds_n3[0], self.fft_direction());
1558        scratch_odds_n3[1] = twiddles::rotate_90(scratch_odds_n3[1], self.fft_direction());
1559        scratch_odds_n3[2] = twiddles::rotate_90(scratch_odds_n3[2], self.fft_direction());
1560        scratch_odds_n3[3] = twiddles::rotate_90(scratch_odds_n3[3], self.fft_direction());
1561
1562        //step 5: copy/add/subtract data back to buffer
1563        buffer.store(scratch_evens[0] + scratch_odds_n1[0], 0);
1564        buffer.store(scratch_evens[1] + scratch_odds_n1[1], 1);
1565        buffer.store(scratch_evens[2] + scratch_odds_n1[2], 2);
1566        buffer.store(scratch_evens[3] + scratch_odds_n1[3], 3);
1567        buffer.store(scratch_evens[4] + scratch_odds_n3[0], 4);
1568        buffer.store(scratch_evens[5] + scratch_odds_n3[1], 5);
1569        buffer.store(scratch_evens[6] + scratch_odds_n3[2], 6);
1570        buffer.store(scratch_evens[7] + scratch_odds_n3[3], 7);
1571        buffer.store(scratch_evens[0] - scratch_odds_n1[0], 8);
1572        buffer.store(scratch_evens[1] - scratch_odds_n1[1], 9);
1573        buffer.store(scratch_evens[2] - scratch_odds_n1[2], 10);
1574        buffer.store(scratch_evens[3] - scratch_odds_n1[3], 11);
1575        buffer.store(scratch_evens[4] - scratch_odds_n3[0], 12);
1576        buffer.store(scratch_evens[5] - scratch_odds_n3[1], 13);
1577        buffer.store(scratch_evens[6] - scratch_odds_n3[2], 14);
1578        buffer.store(scratch_evens[7] - scratch_odds_n3[3], 15);
1579    }
1580}
1581
1582pub struct Butterfly17<T> {
1583    twiddle1: Complex<T>,
1584    twiddle2: Complex<T>,
1585    twiddle3: Complex<T>,
1586    twiddle4: Complex<T>,
1587    twiddle5: Complex<T>,
1588    twiddle6: Complex<T>,
1589    twiddle7: Complex<T>,
1590    twiddle8: Complex<T>,
1591    direction: FftDirection,
1592}
1593boilerplate_fft_butterfly!(Butterfly17, 17, |this: &Butterfly17<_>| this.direction);
// Constructor and arithmetic kernel for the 17-point butterfly.
impl<T: FftNum> Butterfly17<T> {
    /// Creates a 17-point butterfly for the given `direction`.
    ///
    /// Precomputes `twiddles::compute_twiddle(k, 17, direction)` for `k = 1..=8`
    /// and stores the eight results alongside `direction`.
    pub fn new(direction: FftDirection) -> Self {
        let twiddle1: Complex<T> = twiddles::compute_twiddle(1, 17, direction);
        let twiddle2: Complex<T> = twiddles::compute_twiddle(2, 17, direction);
        let twiddle3: Complex<T> = twiddles::compute_twiddle(3, 17, direction);
        let twiddle4: Complex<T> = twiddles::compute_twiddle(4, 17, direction);
        let twiddle5: Complex<T> = twiddles::compute_twiddle(5, 17, direction);
        let twiddle6: Complex<T> = twiddles::compute_twiddle(6, 17, direction);
        let twiddle7: Complex<T> = twiddles::compute_twiddle(7, 17, direction);
        let twiddle8: Complex<T> = twiddles::compute_twiddle(8, 17, direction);
        Self {
            twiddle1,
            twiddle2,
            twiddle3,
            twiddle4,
            twiddle5,
            twiddle6,
            twiddle7,
            twiddle8,
            direction,
        }
    }

    /// Computes one 17-point FFT, loading inputs 0..=16 from `buffer` and
    /// storing outputs 0..=16 back through it.
    ///
    /// Every `load` happens before the first `store`.
    ///
    /// # Safety
    ///
    /// NOTE(review): presumably `buffer` must provide at least 17 elements,
    /// since indices 0 through 16 are both loaded and stored here; confirm
    /// against the `LoadStore` contract.
    // NOTE(review): `inline(never)` is presumably there to keep this large
    // generated body from being duplicated at every call site; confirm.
    #[inline(never)]
    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
        // This function was derived in the same manner as the butterflies for length 3, 5 and 7.
        // However, instead of doing it by hand the actual code is autogenerated
        // with the `genbutterflies.py` script in the `tools` directory.

        // Pair input n with input 17 - n: `x<n><17-n>p` is their sum and
        // `x<n><17-n>n` is their difference (lower index minus higher index).
        let x116p = buffer.load(1) + buffer.load(16);
        let x116n = buffer.load(1) - buffer.load(16);
        let x215p = buffer.load(2) + buffer.load(15);
        let x215n = buffer.load(2) - buffer.load(15);
        let x314p = buffer.load(3) + buffer.load(14);
        let x314n = buffer.load(3) - buffer.load(14);
        let x413p = buffer.load(4) + buffer.load(13);
        let x413n = buffer.load(4) - buffer.load(13);
        let x512p = buffer.load(5) + buffer.load(12);
        let x512n = buffer.load(5) - buffer.load(12);
        let x611p = buffer.load(6) + buffer.load(11);
        let x611n = buffer.load(6) - buffer.load(11);
        let x710p = buffer.load(7) + buffer.load(10);
        let x710n = buffer.load(7) - buffer.load(10);
        let x89p = buffer.load(8) + buffer.load(9);
        let x89n = buffer.load(8) - buffer.load(9);
        // Output 0 is the plain sum of all 17 inputs (input 0 plus the eight pair sums).
        let sum = buffer.load(0) + x116p + x215p + x314p + x413p + x512p + x611p + x710p + x89p;

        // Real-part terms for each output pair (k, 17 - k), k = 1..=8:
        //   `_a` = input0.re + sum over pairs of twiddle.re * pair_sum.re
        //   `_b` =             sum over pairs of twiddle.im * pair_diff.im
        // For row k, the j-th pair uses twiddle index m = (j * k) mod 17. When
        // m > 8 it is folded to 17 - m, and in the `_b` rows a folded index is
        // multiplied with a negated `.im`.
        let b116re_a = buffer.load(0).re
            + self.twiddle1.re * x116p.re
            + self.twiddle2.re * x215p.re
            + self.twiddle3.re * x314p.re
            + self.twiddle4.re * x413p.re
            + self.twiddle5.re * x512p.re
            + self.twiddle6.re * x611p.re
            + self.twiddle7.re * x710p.re
            + self.twiddle8.re * x89p.re;
        let b116re_b = self.twiddle1.im * x116n.im
            + self.twiddle2.im * x215n.im
            + self.twiddle3.im * x314n.im
            + self.twiddle4.im * x413n.im
            + self.twiddle5.im * x512n.im
            + self.twiddle6.im * x611n.im
            + self.twiddle7.im * x710n.im
            + self.twiddle8.im * x89n.im;
        let b215re_a = buffer.load(0).re
            + self.twiddle2.re * x116p.re
            + self.twiddle4.re * x215p.re
            + self.twiddle6.re * x314p.re
            + self.twiddle8.re * x413p.re
            + self.twiddle7.re * x512p.re
            + self.twiddle5.re * x611p.re
            + self.twiddle3.re * x710p.re
            + self.twiddle1.re * x89p.re;
        let b215re_b = self.twiddle2.im * x116n.im
            + self.twiddle4.im * x215n.im
            + self.twiddle6.im * x314n.im
            + self.twiddle8.im * x413n.im
            + -self.twiddle7.im * x512n.im
            + -self.twiddle5.im * x611n.im
            + -self.twiddle3.im * x710n.im
            + -self.twiddle1.im * x89n.im;
        let b314re_a = buffer.load(0).re
            + self.twiddle3.re * x116p.re
            + self.twiddle6.re * x215p.re
            + self.twiddle8.re * x314p.re
            + self.twiddle5.re * x413p.re
            + self.twiddle2.re * x512p.re
            + self.twiddle1.re * x611p.re
            + self.twiddle4.re * x710p.re
            + self.twiddle7.re * x89p.re;
        let b314re_b = self.twiddle3.im * x116n.im
            + self.twiddle6.im * x215n.im
            + -self.twiddle8.im * x314n.im
            + -self.twiddle5.im * x413n.im
            + -self.twiddle2.im * x512n.im
            + self.twiddle1.im * x611n.im
            + self.twiddle4.im * x710n.im
            + self.twiddle7.im * x89n.im;
        let b413re_a = buffer.load(0).re
            + self.twiddle4.re * x116p.re
            + self.twiddle8.re * x215p.re
            + self.twiddle5.re * x314p.re
            + self.twiddle1.re * x413p.re
            + self.twiddle3.re * x512p.re
            + self.twiddle7.re * x611p.re
            + self.twiddle6.re * x710p.re
            + self.twiddle2.re * x89p.re;
        let b413re_b = self.twiddle4.im * x116n.im
            + self.twiddle8.im * x215n.im
            + -self.twiddle5.im * x314n.im
            + -self.twiddle1.im * x413n.im
            + self.twiddle3.im * x512n.im
            + self.twiddle7.im * x611n.im
            + -self.twiddle6.im * x710n.im
            + -self.twiddle2.im * x89n.im;
        let b512re_a = buffer.load(0).re
            + self.twiddle5.re * x116p.re
            + self.twiddle7.re * x215p.re
            + self.twiddle2.re * x314p.re
            + self.twiddle3.re * x413p.re
            + self.twiddle8.re * x512p.re
            + self.twiddle4.re * x611p.re
            + self.twiddle1.re * x710p.re
            + self.twiddle6.re * x89p.re;
        let b512re_b = self.twiddle5.im * x116n.im
            + -self.twiddle7.im * x215n.im
            + -self.twiddle2.im * x314n.im
            + self.twiddle3.im * x413n.im
            + self.twiddle8.im * x512n.im
            + -self.twiddle4.im * x611n.im
            + self.twiddle1.im * x710n.im
            + self.twiddle6.im * x89n.im;
        let b611re_a = buffer.load(0).re
            + self.twiddle6.re * x116p.re
            + self.twiddle5.re * x215p.re
            + self.twiddle1.re * x314p.re
            + self.twiddle7.re * x413p.re
            + self.twiddle4.re * x512p.re
            + self.twiddle2.re * x611p.re
            + self.twiddle8.re * x710p.re
            + self.twiddle3.re * x89p.re;
        let b611re_b = self.twiddle6.im * x116n.im
            + -self.twiddle5.im * x215n.im
            + self.twiddle1.im * x314n.im
            + self.twiddle7.im * x413n.im
            + -self.twiddle4.im * x512n.im
            + self.twiddle2.im * x611n.im
            + self.twiddle8.im * x710n.im
            + -self.twiddle3.im * x89n.im;
        let b710re_a = buffer.load(0).re
            + self.twiddle7.re * x116p.re
            + self.twiddle3.re * x215p.re
            + self.twiddle4.re * x314p.re
            + self.twiddle6.re * x413p.re
            + self.twiddle1.re * x512p.re
            + self.twiddle8.re * x611p.re
            + self.twiddle2.re * x710p.re
            + self.twiddle5.re * x89p.re;
        let b710re_b = self.twiddle7.im * x116n.im
            + -self.twiddle3.im * x215n.im
            + self.twiddle4.im * x314n.im
            + -self.twiddle6.im * x413n.im
            + self.twiddle1.im * x512n.im
            + self.twiddle8.im * x611n.im
            + -self.twiddle2.im * x710n.im
            + self.twiddle5.im * x89n.im;
        let b89re_a = buffer.load(0).re
            + self.twiddle8.re * x116p.re
            + self.twiddle1.re * x215p.re
            + self.twiddle7.re * x314p.re
            + self.twiddle2.re * x413p.re
            + self.twiddle6.re * x512p.re
            + self.twiddle3.re * x611p.re
            + self.twiddle5.re * x710p.re
            + self.twiddle4.re * x89p.re;
        let b89re_b = self.twiddle8.im * x116n.im
            + -self.twiddle1.im * x215n.im
            + self.twiddle7.im * x314n.im
            + -self.twiddle2.im * x413n.im
            + self.twiddle6.im * x512n.im
            + -self.twiddle3.im * x611n.im
            + self.twiddle5.im * x710n.im
            + -self.twiddle4.im * x89n.im;

        // Imaginary-part terms: same twiddle indices and signs as the real rows
        // above, but `_a` multiplies twiddle.re by pair_sum.im and `_b`
        // multiplies twiddle.im by pair_diff.re.
        let b116im_a = buffer.load(0).im
            + self.twiddle1.re * x116p.im
            + self.twiddle2.re * x215p.im
            + self.twiddle3.re * x314p.im
            + self.twiddle4.re * x413p.im
            + self.twiddle5.re * x512p.im
            + self.twiddle6.re * x611p.im
            + self.twiddle7.re * x710p.im
            + self.twiddle8.re * x89p.im;
        let b116im_b = self.twiddle1.im * x116n.re
            + self.twiddle2.im * x215n.re
            + self.twiddle3.im * x314n.re
            + self.twiddle4.im * x413n.re
            + self.twiddle5.im * x512n.re
            + self.twiddle6.im * x611n.re
            + self.twiddle7.im * x710n.re
            + self.twiddle8.im * x89n.re;
        let b215im_a = buffer.load(0).im
            + self.twiddle2.re * x116p.im
            + self.twiddle4.re * x215p.im
            + self.twiddle6.re * x314p.im
            + self.twiddle8.re * x413p.im
            + self.twiddle7.re * x512p.im
            + self.twiddle5.re * x611p.im
            + self.twiddle3.re * x710p.im
            + self.twiddle1.re * x89p.im;
        let b215im_b = self.twiddle2.im * x116n.re
            + self.twiddle4.im * x215n.re
            + self.twiddle6.im * x314n.re
            + self.twiddle8.im * x413n.re
            + -self.twiddle7.im * x512n.re
            + -self.twiddle5.im * x611n.re
            + -self.twiddle3.im * x710n.re
            + -self.twiddle1.im * x89n.re;
        let b314im_a = buffer.load(0).im
            + self.twiddle3.re * x116p.im
            + self.twiddle6.re * x215p.im
            + self.twiddle8.re * x314p.im
            + self.twiddle5.re * x413p.im
            + self.twiddle2.re * x512p.im
            + self.twiddle1.re * x611p.im
            + self.twiddle4.re * x710p.im
            + self.twiddle7.re * x89p.im;
        let b314im_b = self.twiddle3.im * x116n.re
            + self.twiddle6.im * x215n.re
            + -self.twiddle8.im * x314n.re
            + -self.twiddle5.im * x413n.re
            + -self.twiddle2.im * x512n.re
            + self.twiddle1.im * x611n.re
            + self.twiddle4.im * x710n.re
            + self.twiddle7.im * x89n.re;
        let b413im_a = buffer.load(0).im
            + self.twiddle4.re * x116p.im
            + self.twiddle8.re * x215p.im
            + self.twiddle5.re * x314p.im
            + self.twiddle1.re * x413p.im
            + self.twiddle3.re * x512p.im
            + self.twiddle7.re * x611p.im
            + self.twiddle6.re * x710p.im
            + self.twiddle2.re * x89p.im;
        let b413im_b = self.twiddle4.im * x116n.re
            + self.twiddle8.im * x215n.re
            + -self.twiddle5.im * x314n.re
            + -self.twiddle1.im * x413n.re
            + self.twiddle3.im * x512n.re
            + self.twiddle7.im * x611n.re
            + -self.twiddle6.im * x710n.re
            + -self.twiddle2.im * x89n.re;
        let b512im_a = buffer.load(0).im
            + self.twiddle5.re * x116p.im
            + self.twiddle7.re * x215p.im
            + self.twiddle2.re * x314p.im
            + self.twiddle3.re * x413p.im
            + self.twiddle8.re * x512p.im
            + self.twiddle4.re * x611p.im
            + self.twiddle1.re * x710p.im
            + self.twiddle6.re * x89p.im;
        let b512im_b = self.twiddle5.im * x116n.re
            + -self.twiddle7.im * x215n.re
            + -self.twiddle2.im * x314n.re
            + self.twiddle3.im * x413n.re
            + self.twiddle8.im * x512n.re
            + -self.twiddle4.im * x611n.re
            + self.twiddle1.im * x710n.re
            + self.twiddle6.im * x89n.re;
        let b611im_a = buffer.load(0).im
            + self.twiddle6.re * x116p.im
            + self.twiddle5.re * x215p.im
            + self.twiddle1.re * x314p.im
            + self.twiddle7.re * x413p.im
            + self.twiddle4.re * x512p.im
            + self.twiddle2.re * x611p.im
            + self.twiddle8.re * x710p.im
            + self.twiddle3.re * x89p.im;
        let b611im_b = self.twiddle6.im * x116n.re
            + -self.twiddle5.im * x215n.re
            + self.twiddle1.im * x314n.re
            + self.twiddle7.im * x413n.re
            + -self.twiddle4.im * x512n.re
            + self.twiddle2.im * x611n.re
            + self.twiddle8.im * x710n.re
            + -self.twiddle3.im * x89n.re;
        let b710im_a = buffer.load(0).im
            + self.twiddle7.re * x116p.im
            + self.twiddle3.re * x215p.im
            + self.twiddle4.re * x314p.im
            + self.twiddle6.re * x413p.im
            + self.twiddle1.re * x512p.im
            + self.twiddle8.re * x611p.im
            + self.twiddle2.re * x710p.im
            + self.twiddle5.re * x89p.im;
        let b710im_b = self.twiddle7.im * x116n.re
            + -self.twiddle3.im * x215n.re
            + self.twiddle4.im * x314n.re
            + -self.twiddle6.im * x413n.re
            + self.twiddle1.im * x512n.re
            + self.twiddle8.im * x611n.re
            + -self.twiddle2.im * x710n.re
            + self.twiddle5.im * x89n.re;
        let b89im_a = buffer.load(0).im
            + self.twiddle8.re * x116p.im
            + self.twiddle1.re * x215p.im
            + self.twiddle7.re * x314p.im
            + self.twiddle2.re * x413p.im
            + self.twiddle6.re * x512p.im
            + self.twiddle3.re * x611p.im
            + self.twiddle5.re * x710p.im
            + self.twiddle4.re * x89p.im;
        let b89im_b = self.twiddle8.im * x116n.re
            + -self.twiddle1.im * x215n.re
            + self.twiddle7.im * x314n.re
            + -self.twiddle2.im * x413n.re
            + self.twiddle6.im * x512n.re
            + -self.twiddle3.im * x611n.re
            + self.twiddle5.im * x710n.re
            + -self.twiddle4.im * x89n.re;

        // Outputs k and 17 - k share the same `_a`/`_b` terms and differ only in
        // the sign applied to `_b`:
        //   out[k]      = (re_a - re_b, im_a + im_b)
        //   out[17 - k] = (re_a + re_b, im_a - im_b)
        let out1re = b116re_a - b116re_b;
        let out1im = b116im_a + b116im_b;
        let out2re = b215re_a - b215re_b;
        let out2im = b215im_a + b215im_b;
        let out3re = b314re_a - b314re_b;
        let out3im = b314im_a + b314im_b;
        let out4re = b413re_a - b413re_b;
        let out4im = b413im_a + b413im_b;
        let out5re = b512re_a - b512re_b;
        let out5im = b512im_a + b512im_b;
        let out6re = b611re_a - b611re_b;
        let out6im = b611im_a + b611im_b;
        let out7re = b710re_a - b710re_b;
        let out7im = b710im_a + b710im_b;
        let out8re = b89re_a - b89re_b;
        let out8im = b89im_a + b89im_b;
        let out9re = b89re_a + b89re_b;
        let out9im = b89im_a - b89im_b;
        let out10re = b710re_a + b710re_b;
        let out10im = b710im_a - b710im_b;
        let out11re = b611re_a + b611re_b;
        let out11im = b611im_a - b611im_b;
        let out12re = b512re_a + b512re_b;
        let out12im = b512im_a - b512im_b;
        let out13re = b413re_a + b413re_b;
        let out13im = b413im_a - b413im_b;
        let out14re = b314re_a + b314re_b;
        let out14im = b314im_a - b314im_b;
        let out15re = b215re_a + b215re_b;
        let out15im = b215im_a - b215im_b;
        let out16re = b116re_a + b116re_b;
        let out16im = b116im_a - b116im_b;
        // Write the 17 results back through `buffer`, output k to index k.
        buffer.store(sum, 0);
        buffer.store(
            Complex {
                re: out1re,
                im: out1im,
            },
            1,
        );
        buffer.store(
            Complex {
                re: out2re,
                im: out2im,
            },
            2,
        );
        buffer.store(
            Complex {
                re: out3re,
                im: out3im,
            },
            3,
        );
        buffer.store(
            Complex {
                re: out4re,
                im: out4im,
            },
            4,
        );
        buffer.store(
            Complex {
                re: out5re,
                im: out5im,
            },
            5,
        );
        buffer.store(
            Complex {
                re: out6re,
                im: out6im,
            },
            6,
        );
        buffer.store(
            Complex {
                re: out7re,
                im: out7im,
            },
            7,
        );
        buffer.store(
            Complex {
                re: out8re,
                im: out8im,
            },
            8,
        );
        buffer.store(
            Complex {
                re: out9re,
                im: out9im,
            },
            9,
        );
        buffer.store(
            Complex {
                re: out10re,
                im: out10im,
            },
            10,
        );
        buffer.store(
            Complex {
                re: out11re,
                im: out11im,
            },
            11,
        );
        buffer.store(
            Complex {
                re: out12re,
                im: out12im,
            },
            12,
        );
        buffer.store(
            Complex {
                re: out13re,
                im: out13im,
            },
            13,
        );
        buffer.store(
            Complex {
                re: out14re,
                im: out14im,
            },
            14,
        );
        buffer.store(
            Complex {
                re: out15re,
                im: out15im,
            },
            15,
        );
        buffer.store(
            Complex {
                re: out16re,
                im: out16im,
            },
            16,
        );
    }
}
2060
/// Hard-coded FFT kernel ("butterfly") for buffers of exactly 19 elements.
///
/// Holds the nine twiddle factors that `new` precomputes with
/// `twiddles::compute_twiddle(k, 19, direction)` for `k = 1..=9`, together with
/// the direction they were computed for. Only nine are stored because
/// `perform_fft_contiguous` first combines input `n` with input `19 - n` into a
/// sum and a difference, so each stored twiddle is shared by both members of a
/// pair.
pub struct Butterfly19<T> {
    // `twiddleK` is `compute_twiddle(K, 19, direction)`; see `new`.
    twiddle1: Complex<T>,
    twiddle2: Complex<T>,
    twiddle3: Complex<T>,
    twiddle4: Complex<T>,
    twiddle5: Complex<T>,
    twiddle6: Complex<T>,
    twiddle7: Complex<T>,
    twiddle8: Complex<T>,
    twiddle9: Complex<T>,
    // Direction passed to `new`; read back by the closure given to the macro below.
    direction: FftDirection,
}
// Generates the shared butterfly plumbing for this type: the
// `perform_fft_butterfly` wrapper around `perform_fft_contiguous` and the
// `Fft` trait impl. The arguments are the type name, its FFT length (19), and a
// closure that returns the stored direction.
boilerplate_fft_butterfly!(Butterfly19, 19, |this: &Butterfly19<_>| this.direction);
2074impl<T: FftNum> Butterfly19<T> {
2075    pub fn new(direction: FftDirection) -> Self {
2076        let twiddle1: Complex<T> = twiddles::compute_twiddle(1, 19, direction);
2077        let twiddle2: Complex<T> = twiddles::compute_twiddle(2, 19, direction);
2078        let twiddle3: Complex<T> = twiddles::compute_twiddle(3, 19, direction);
2079        let twiddle4: Complex<T> = twiddles::compute_twiddle(4, 19, direction);
2080        let twiddle5: Complex<T> = twiddles::compute_twiddle(5, 19, direction);
2081        let twiddle6: Complex<T> = twiddles::compute_twiddle(6, 19, direction);
2082        let twiddle7: Complex<T> = twiddles::compute_twiddle(7, 19, direction);
2083        let twiddle8: Complex<T> = twiddles::compute_twiddle(8, 19, direction);
2084        let twiddle9: Complex<T> = twiddles::compute_twiddle(9, 19, direction);
2085        Self {
2086            twiddle1,
2087            twiddle2,
2088            twiddle3,
2089            twiddle4,
2090            twiddle5,
2091            twiddle6,
2092            twiddle7,
2093            twiddle8,
2094            twiddle9,
2095            direction,
2096        }
2097    }
2098
2099    #[inline(never)]
2100    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
2101        // This function was derived in the same manner as the butterflies for length 3, 5 and 7.
2102        // However, instead of doing it by hand the actual code is autogenerated
2103        // with the `genbutterflies.py` script in the `tools` directory.
2104        let x118p = buffer.load(1) + buffer.load(18);
2105        let x118n = buffer.load(1) - buffer.load(18);
2106        let x217p = buffer.load(2) + buffer.load(17);
2107        let x217n = buffer.load(2) - buffer.load(17);
2108        let x316p = buffer.load(3) + buffer.load(16);
2109        let x316n = buffer.load(3) - buffer.load(16);
2110        let x415p = buffer.load(4) + buffer.load(15);
2111        let x415n = buffer.load(4) - buffer.load(15);
2112        let x514p = buffer.load(5) + buffer.load(14);
2113        let x514n = buffer.load(5) - buffer.load(14);
2114        let x613p = buffer.load(6) + buffer.load(13);
2115        let x613n = buffer.load(6) - buffer.load(13);
2116        let x712p = buffer.load(7) + buffer.load(12);
2117        let x712n = buffer.load(7) - buffer.load(12);
2118        let x811p = buffer.load(8) + buffer.load(11);
2119        let x811n = buffer.load(8) - buffer.load(11);
2120        let x910p = buffer.load(9) + buffer.load(10);
2121        let x910n = buffer.load(9) - buffer.load(10);
2122        let sum =
2123            buffer.load(0) + x118p + x217p + x316p + x415p + x514p + x613p + x712p + x811p + x910p;
2124        let b118re_a = buffer.load(0).re
2125            + self.twiddle1.re * x118p.re
2126            + self.twiddle2.re * x217p.re
2127            + self.twiddle3.re * x316p.re
2128            + self.twiddle4.re * x415p.re
2129            + self.twiddle5.re * x514p.re
2130            + self.twiddle6.re * x613p.re
2131            + self.twiddle7.re * x712p.re
2132            + self.twiddle8.re * x811p.re
2133            + self.twiddle9.re * x910p.re;
2134        let b118re_b = self.twiddle1.im * x118n.im
2135            + self.twiddle2.im * x217n.im
2136            + self.twiddle3.im * x316n.im
2137            + self.twiddle4.im * x415n.im
2138            + self.twiddle5.im * x514n.im
2139            + self.twiddle6.im * x613n.im
2140            + self.twiddle7.im * x712n.im
2141            + self.twiddle8.im * x811n.im
2142            + self.twiddle9.im * x910n.im;
2143        let b217re_a = buffer.load(0).re
2144            + self.twiddle2.re * x118p.re
2145            + self.twiddle4.re * x217p.re
2146            + self.twiddle6.re * x316p.re
2147            + self.twiddle8.re * x415p.re
2148            + self.twiddle9.re * x514p.re
2149            + self.twiddle7.re * x613p.re
2150            + self.twiddle5.re * x712p.re
2151            + self.twiddle3.re * x811p.re
2152            + self.twiddle1.re * x910p.re;
2153        let b217re_b = self.twiddle2.im * x118n.im
2154            + self.twiddle4.im * x217n.im
2155            + self.twiddle6.im * x316n.im
2156            + self.twiddle8.im * x415n.im
2157            + -self.twiddle9.im * x514n.im
2158            + -self.twiddle7.im * x613n.im
2159            + -self.twiddle5.im * x712n.im
2160            + -self.twiddle3.im * x811n.im
2161            + -self.twiddle1.im * x910n.im;
2162        let b316re_a = buffer.load(0).re
2163            + self.twiddle3.re * x118p.re
2164            + self.twiddle6.re * x217p.re
2165            + self.twiddle9.re * x316p.re
2166            + self.twiddle7.re * x415p.re
2167            + self.twiddle4.re * x514p.re
2168            + self.twiddle1.re * x613p.re
2169            + self.twiddle2.re * x712p.re
2170            + self.twiddle5.re * x811p.re
2171            + self.twiddle8.re * x910p.re;
2172        let b316re_b = self.twiddle3.im * x118n.im
2173            + self.twiddle6.im * x217n.im
2174            + self.twiddle9.im * x316n.im
2175            + -self.twiddle7.im * x415n.im
2176            + -self.twiddle4.im * x514n.im
2177            + -self.twiddle1.im * x613n.im
2178            + self.twiddle2.im * x712n.im
2179            + self.twiddle5.im * x811n.im
2180            + self.twiddle8.im * x910n.im;
2181        let b415re_a = buffer.load(0).re
2182            + self.twiddle4.re * x118p.re
2183            + self.twiddle8.re * x217p.re
2184            + self.twiddle7.re * x316p.re
2185            + self.twiddle3.re * x415p.re
2186            + self.twiddle1.re * x514p.re
2187            + self.twiddle5.re * x613p.re
2188            + self.twiddle9.re * x712p.re
2189            + self.twiddle6.re * x811p.re
2190            + self.twiddle2.re * x910p.re;
2191        let b415re_b = self.twiddle4.im * x118n.im
2192            + self.twiddle8.im * x217n.im
2193            + -self.twiddle7.im * x316n.im
2194            + -self.twiddle3.im * x415n.im
2195            + self.twiddle1.im * x514n.im
2196            + self.twiddle5.im * x613n.im
2197            + self.twiddle9.im * x712n.im
2198            + -self.twiddle6.im * x811n.im
2199            + -self.twiddle2.im * x910n.im;
2200        let b514re_a = buffer.load(0).re
2201            + self.twiddle5.re * x118p.re
2202            + self.twiddle9.re * x217p.re
2203            + self.twiddle4.re * x316p.re
2204            + self.twiddle1.re * x415p.re
2205            + self.twiddle6.re * x514p.re
2206            + self.twiddle8.re * x613p.re
2207            + self.twiddle3.re * x712p.re
2208            + self.twiddle2.re * x811p.re
2209            + self.twiddle7.re * x910p.re;
2210        let b514re_b = self.twiddle5.im * x118n.im
2211            + -self.twiddle9.im * x217n.im
2212            + -self.twiddle4.im * x316n.im
2213            + self.twiddle1.im * x415n.im
2214            + self.twiddle6.im * x514n.im
2215            + -self.twiddle8.im * x613n.im
2216            + -self.twiddle3.im * x712n.im
2217            + self.twiddle2.im * x811n.im
2218            + self.twiddle7.im * x910n.im;
2219        let b613re_a = buffer.load(0).re
2220            + self.twiddle6.re * x118p.re
2221            + self.twiddle7.re * x217p.re
2222            + self.twiddle1.re * x316p.re
2223            + self.twiddle5.re * x415p.re
2224            + self.twiddle8.re * x514p.re
2225            + self.twiddle2.re * x613p.re
2226            + self.twiddle4.re * x712p.re
2227            + self.twiddle9.re * x811p.re
2228            + self.twiddle3.re * x910p.re;
2229        let b613re_b = self.twiddle6.im * x118n.im
2230            + -self.twiddle7.im * x217n.im
2231            + -self.twiddle1.im * x316n.im
2232            + self.twiddle5.im * x415n.im
2233            + -self.twiddle8.im * x514n.im
2234            + -self.twiddle2.im * x613n.im
2235            + self.twiddle4.im * x712n.im
2236            + -self.twiddle9.im * x811n.im
2237            + -self.twiddle3.im * x910n.im;
2238        let b712re_a = buffer.load(0).re
2239            + self.twiddle7.re * x118p.re
2240            + self.twiddle5.re * x217p.re
2241            + self.twiddle2.re * x316p.re
2242            + self.twiddle9.re * x415p.re
2243            + self.twiddle3.re * x514p.re
2244            + self.twiddle4.re * x613p.re
2245            + self.twiddle8.re * x712p.re
2246            + self.twiddle1.re * x811p.re
2247            + self.twiddle6.re * x910p.re;
2248        let b712re_b = self.twiddle7.im * x118n.im
2249            + -self.twiddle5.im * x217n.im
2250            + self.twiddle2.im * x316n.im
2251            + self.twiddle9.im * x415n.im
2252            + -self.twiddle3.im * x514n.im
2253            + self.twiddle4.im * x613n.im
2254            + -self.twiddle8.im * x712n.im
2255            + -self.twiddle1.im * x811n.im
2256            + self.twiddle6.im * x910n.im;
2257        let b811re_a = buffer.load(0).re
2258            + self.twiddle8.re * x118p.re
2259            + self.twiddle3.re * x217p.re
2260            + self.twiddle5.re * x316p.re
2261            + self.twiddle6.re * x415p.re
2262            + self.twiddle2.re * x514p.re
2263            + self.twiddle9.re * x613p.re
2264            + self.twiddle1.re * x712p.re
2265            + self.twiddle7.re * x811p.re
2266            + self.twiddle4.re * x910p.re;
2267        let b811re_b = self.twiddle8.im * x118n.im
2268            + -self.twiddle3.im * x217n.im
2269            + self.twiddle5.im * x316n.im
2270            + -self.twiddle6.im * x415n.im
2271            + self.twiddle2.im * x514n.im
2272            + -self.twiddle9.im * x613n.im
2273            + -self.twiddle1.im * x712n.im
2274            + self.twiddle7.im * x811n.im
2275            + -self.twiddle4.im * x910n.im;
2276        let b910re_a = buffer.load(0).re
2277            + self.twiddle9.re * x118p.re
2278            + self.twiddle1.re * x217p.re
2279            + self.twiddle8.re * x316p.re
2280            + self.twiddle2.re * x415p.re
2281            + self.twiddle7.re * x514p.re
2282            + self.twiddle3.re * x613p.re
2283            + self.twiddle6.re * x712p.re
2284            + self.twiddle4.re * x811p.re
2285            + self.twiddle5.re * x910p.re;
2286        let b910re_b = self.twiddle9.im * x118n.im
2287            + -self.twiddle1.im * x217n.im
2288            + self.twiddle8.im * x316n.im
2289            + -self.twiddle2.im * x415n.im
2290            + self.twiddle7.im * x514n.im
2291            + -self.twiddle3.im * x613n.im
2292            + self.twiddle6.im * x712n.im
2293            + -self.twiddle4.im * x811n.im
2294            + self.twiddle5.im * x910n.im;
2295
2296        let b118im_a = buffer.load(0).im
2297            + self.twiddle1.re * x118p.im
2298            + self.twiddle2.re * x217p.im
2299            + self.twiddle3.re * x316p.im
2300            + self.twiddle4.re * x415p.im
2301            + self.twiddle5.re * x514p.im
2302            + self.twiddle6.re * x613p.im
2303            + self.twiddle7.re * x712p.im
2304            + self.twiddle8.re * x811p.im
2305            + self.twiddle9.re * x910p.im;
2306        let b118im_b = self.twiddle1.im * x118n.re
2307            + self.twiddle2.im * x217n.re
2308            + self.twiddle3.im * x316n.re
2309            + self.twiddle4.im * x415n.re
2310            + self.twiddle5.im * x514n.re
2311            + self.twiddle6.im * x613n.re
2312            + self.twiddle7.im * x712n.re
2313            + self.twiddle8.im * x811n.re
2314            + self.twiddle9.im * x910n.re;
2315        let b217im_a = buffer.load(0).im
2316            + self.twiddle2.re * x118p.im
2317            + self.twiddle4.re * x217p.im
2318            + self.twiddle6.re * x316p.im
2319            + self.twiddle8.re * x415p.im
2320            + self.twiddle9.re * x514p.im
2321            + self.twiddle7.re * x613p.im
2322            + self.twiddle5.re * x712p.im
2323            + self.twiddle3.re * x811p.im
2324            + self.twiddle1.re * x910p.im;
2325        let b217im_b = self.twiddle2.im * x118n.re
2326            + self.twiddle4.im * x217n.re
2327            + self.twiddle6.im * x316n.re
2328            + self.twiddle8.im * x415n.re
2329            + -self.twiddle9.im * x514n.re
2330            + -self.twiddle7.im * x613n.re
2331            + -self.twiddle5.im * x712n.re
2332            + -self.twiddle3.im * x811n.re
2333            + -self.twiddle1.im * x910n.re;
2334        let b316im_a = buffer.load(0).im
2335            + self.twiddle3.re * x118p.im
2336            + self.twiddle6.re * x217p.im
2337            + self.twiddle9.re * x316p.im
2338            + self.twiddle7.re * x415p.im
2339            + self.twiddle4.re * x514p.im
2340            + self.twiddle1.re * x613p.im
2341            + self.twiddle2.re * x712p.im
2342            + self.twiddle5.re * x811p.im
2343            + self.twiddle8.re * x910p.im;
2344        let b316im_b = self.twiddle3.im * x118n.re
2345            + self.twiddle6.im * x217n.re
2346            + self.twiddle9.im * x316n.re
2347            + -self.twiddle7.im * x415n.re
2348            + -self.twiddle4.im * x514n.re
2349            + -self.twiddle1.im * x613n.re
2350            + self.twiddle2.im * x712n.re
2351            + self.twiddle5.im * x811n.re
2352            + self.twiddle8.im * x910n.re;
2353        let b415im_a = buffer.load(0).im
2354            + self.twiddle4.re * x118p.im
2355            + self.twiddle8.re * x217p.im
2356            + self.twiddle7.re * x316p.im
2357            + self.twiddle3.re * x415p.im
2358            + self.twiddle1.re * x514p.im
2359            + self.twiddle5.re * x613p.im
2360            + self.twiddle9.re * x712p.im
2361            + self.twiddle6.re * x811p.im
2362            + self.twiddle2.re * x910p.im;
2363        let b415im_b = self.twiddle4.im * x118n.re
2364            + self.twiddle8.im * x217n.re
2365            + -self.twiddle7.im * x316n.re
2366            + -self.twiddle3.im * x415n.re
2367            + self.twiddle1.im * x514n.re
2368            + self.twiddle5.im * x613n.re
2369            + self.twiddle9.im * x712n.re
2370            + -self.twiddle6.im * x811n.re
2371            + -self.twiddle2.im * x910n.re;
2372        let b514im_a = buffer.load(0).im
2373            + self.twiddle5.re * x118p.im
2374            + self.twiddle9.re * x217p.im
2375            + self.twiddle4.re * x316p.im
2376            + self.twiddle1.re * x415p.im
2377            + self.twiddle6.re * x514p.im
2378            + self.twiddle8.re * x613p.im
2379            + self.twiddle3.re * x712p.im
2380            + self.twiddle2.re * x811p.im
2381            + self.twiddle7.re * x910p.im;
2382        let b514im_b = self.twiddle5.im * x118n.re
2383            + -self.twiddle9.im * x217n.re
2384            + -self.twiddle4.im * x316n.re
2385            + self.twiddle1.im * x415n.re
2386            + self.twiddle6.im * x514n.re
2387            + -self.twiddle8.im * x613n.re
2388            + -self.twiddle3.im * x712n.re
2389            + self.twiddle2.im * x811n.re
2390            + self.twiddle7.im * x910n.re;
2391        let b613im_a = buffer.load(0).im
2392            + self.twiddle6.re * x118p.im
2393            + self.twiddle7.re * x217p.im
2394            + self.twiddle1.re * x316p.im
2395            + self.twiddle5.re * x415p.im
2396            + self.twiddle8.re * x514p.im
2397            + self.twiddle2.re * x613p.im
2398            + self.twiddle4.re * x712p.im
2399            + self.twiddle9.re * x811p.im
2400            + self.twiddle3.re * x910p.im;
2401        let b613im_b = self.twiddle6.im * x118n.re
2402            + -self.twiddle7.im * x217n.re
2403            + -self.twiddle1.im * x316n.re
2404            + self.twiddle5.im * x415n.re
2405            + -self.twiddle8.im * x514n.re
2406            + -self.twiddle2.im * x613n.re
2407            + self.twiddle4.im * x712n.re
2408            + -self.twiddle9.im * x811n.re
2409            + -self.twiddle3.im * x910n.re;
2410        let b712im_a = buffer.load(0).im
2411            + self.twiddle7.re * x118p.im
2412            + self.twiddle5.re * x217p.im
2413            + self.twiddle2.re * x316p.im
2414            + self.twiddle9.re * x415p.im
2415            + self.twiddle3.re * x514p.im
2416            + self.twiddle4.re * x613p.im
2417            + self.twiddle8.re * x712p.im
2418            + self.twiddle1.re * x811p.im
2419            + self.twiddle6.re * x910p.im;
2420        let b712im_b = self.twiddle7.im * x118n.re
2421            + -self.twiddle5.im * x217n.re
2422            + self.twiddle2.im * x316n.re
2423            + self.twiddle9.im * x415n.re
2424            + -self.twiddle3.im * x514n.re
2425            + self.twiddle4.im * x613n.re
2426            + -self.twiddle8.im * x712n.re
2427            + -self.twiddle1.im * x811n.re
2428            + self.twiddle6.im * x910n.re;
2429        let b811im_a = buffer.load(0).im
2430            + self.twiddle8.re * x118p.im
2431            + self.twiddle3.re * x217p.im
2432            + self.twiddle5.re * x316p.im
2433            + self.twiddle6.re * x415p.im
2434            + self.twiddle2.re * x514p.im
2435            + self.twiddle9.re * x613p.im
2436            + self.twiddle1.re * x712p.im
2437            + self.twiddle7.re * x811p.im
2438            + self.twiddle4.re * x910p.im;
2439        let b811im_b = self.twiddle8.im * x118n.re
2440            + -self.twiddle3.im * x217n.re
2441            + self.twiddle5.im * x316n.re
2442            + -self.twiddle6.im * x415n.re
2443            + self.twiddle2.im * x514n.re
2444            + -self.twiddle9.im * x613n.re
2445            + -self.twiddle1.im * x712n.re
2446            + self.twiddle7.im * x811n.re
2447            + -self.twiddle4.im * x910n.re;
2448        let b910im_a = buffer.load(0).im
2449            + self.twiddle9.re * x118p.im
2450            + self.twiddle1.re * x217p.im
2451            + self.twiddle8.re * x316p.im
2452            + self.twiddle2.re * x415p.im
2453            + self.twiddle7.re * x514p.im
2454            + self.twiddle3.re * x613p.im
2455            + self.twiddle6.re * x712p.im
2456            + self.twiddle4.re * x811p.im
2457            + self.twiddle5.re * x910p.im;
2458        let b910im_b = self.twiddle9.im * x118n.re
2459            + -self.twiddle1.im * x217n.re
2460            + self.twiddle8.im * x316n.re
2461            + -self.twiddle2.im * x415n.re
2462            + self.twiddle7.im * x514n.re
2463            + -self.twiddle3.im * x613n.re
2464            + self.twiddle6.im * x712n.re
2465            + -self.twiddle4.im * x811n.re
2466            + self.twiddle5.im * x910n.re;
2467
2468        let out1re = b118re_a - b118re_b;
2469        let out1im = b118im_a + b118im_b;
2470        let out2re = b217re_a - b217re_b;
2471        let out2im = b217im_a + b217im_b;
2472        let out3re = b316re_a - b316re_b;
2473        let out3im = b316im_a + b316im_b;
2474        let out4re = b415re_a - b415re_b;
2475        let out4im = b415im_a + b415im_b;
2476        let out5re = b514re_a - b514re_b;
2477        let out5im = b514im_a + b514im_b;
2478        let out6re = b613re_a - b613re_b;
2479        let out6im = b613im_a + b613im_b;
2480        let out7re = b712re_a - b712re_b;
2481        let out7im = b712im_a + b712im_b;
2482        let out8re = b811re_a - b811re_b;
2483        let out8im = b811im_a + b811im_b;
2484        let out9re = b910re_a - b910re_b;
2485        let out9im = b910im_a + b910im_b;
2486        let out10re = b910re_a + b910re_b;
2487        let out10im = b910im_a - b910im_b;
2488        let out11re = b811re_a + b811re_b;
2489        let out11im = b811im_a - b811im_b;
2490        let out12re = b712re_a + b712re_b;
2491        let out12im = b712im_a - b712im_b;
2492        let out13re = b613re_a + b613re_b;
2493        let out13im = b613im_a - b613im_b;
2494        let out14re = b514re_a + b514re_b;
2495        let out14im = b514im_a - b514im_b;
2496        let out15re = b415re_a + b415re_b;
2497        let out15im = b415im_a - b415im_b;
2498        let out16re = b316re_a + b316re_b;
2499        let out16im = b316im_a - b316im_b;
2500        let out17re = b217re_a + b217re_b;
2501        let out17im = b217im_a - b217im_b;
2502        let out18re = b118re_a + b118re_b;
2503        let out18im = b118im_a - b118im_b;
2504        buffer.store(sum, 0);
2505        buffer.store(
2506            Complex {
2507                re: out1re,
2508                im: out1im,
2509            },
2510            1,
2511        );
2512        buffer.store(
2513            Complex {
2514                re: out2re,
2515                im: out2im,
2516            },
2517            2,
2518        );
2519        buffer.store(
2520            Complex {
2521                re: out3re,
2522                im: out3im,
2523            },
2524            3,
2525        );
2526        buffer.store(
2527            Complex {
2528                re: out4re,
2529                im: out4im,
2530            },
2531            4,
2532        );
2533        buffer.store(
2534            Complex {
2535                re: out5re,
2536                im: out5im,
2537            },
2538            5,
2539        );
2540        buffer.store(
2541            Complex {
2542                re: out6re,
2543                im: out6im,
2544            },
2545            6,
2546        );
2547        buffer.store(
2548            Complex {
2549                re: out7re,
2550                im: out7im,
2551            },
2552            7,
2553        );
2554        buffer.store(
2555            Complex {
2556                re: out8re,
2557                im: out8im,
2558            },
2559            8,
2560        );
2561        buffer.store(
2562            Complex {
2563                re: out9re,
2564                im: out9im,
2565            },
2566            9,
2567        );
2568        buffer.store(
2569            Complex {
2570                re: out10re,
2571                im: out10im,
2572            },
2573            10,
2574        );
2575        buffer.store(
2576            Complex {
2577                re: out11re,
2578                im: out11im,
2579            },
2580            11,
2581        );
2582        buffer.store(
2583            Complex {
2584                re: out12re,
2585                im: out12im,
2586            },
2587            12,
2588        );
2589        buffer.store(
2590            Complex {
2591                re: out13re,
2592                im: out13im,
2593            },
2594            13,
2595        );
2596        buffer.store(
2597            Complex {
2598                re: out14re,
2599                im: out14im,
2600            },
2601            14,
2602        );
2603        buffer.store(
2604            Complex {
2605                re: out15re,
2606                im: out15im,
2607            },
2608            15,
2609        );
2610        buffer.store(
2611            Complex {
2612                re: out16re,
2613                im: out16im,
2614            },
2615            16,
2616        );
2617        buffer.store(
2618            Complex {
2619                re: out17re,
2620                im: out17im,
2621            },
2622            17,
2623        );
2624        buffer.store(
2625            Complex {
2626                re: out18re,
2627                im: out18im,
2628            },
2629            18,
2630        );
2631    }
2632}
2633
/// FFT butterfly of length 23.
///
/// Stores the eleven twiddle factors that `new` precomputes with
/// `twiddles::compute_twiddle(k, 23, direction)` for `k = 1..=11`, together with the
/// direction they were computed for. `perform_fft_contiguous` reads these fields when it
/// combines the sum/difference pairs built from inputs `k` and `23 - k`.
pub struct Butterfly23<T> {
    // `twiddleK` holds `compute_twiddle(K, 23, direction)`.
    twiddle1: Complex<T>,
    twiddle2: Complex<T>,
    twiddle3: Complex<T>,
    twiddle4: Complex<T>,
    twiddle5: Complex<T>,
    twiddle6: Complex<T>,
    twiddle7: Complex<T>,
    twiddle8: Complex<T>,
    twiddle9: Complex<T>,
    twiddle10: Complex<T>,
    twiddle11: Complex<T>,
    // Direction passed to `new`; the closure given to `boilerplate_fft_butterfly!` returns it.
    direction: FftDirection,
}
// Expands the shared butterfly boilerplate for `Butterfly23`. Per the macro definition at the
// top of this file, this adds `perform_fft_butterfly` (which forwards to
// `perform_fft_contiguous`) and the `Fft<T>` impl. `23` is the macro's `$len` argument and the
// closure is its `$direction_fn` argument, returning the direction stored in the struct.
boilerplate_fft_butterfly!(Butterfly23, 23, |this: &Butterfly23<_>| this.direction);
2649impl<T: FftNum> Butterfly23<T> {
2650    pub fn new(direction: FftDirection) -> Self {
2651        let twiddle1: Complex<T> = twiddles::compute_twiddle(1, 23, direction);
2652        let twiddle2: Complex<T> = twiddles::compute_twiddle(2, 23, direction);
2653        let twiddle3: Complex<T> = twiddles::compute_twiddle(3, 23, direction);
2654        let twiddle4: Complex<T> = twiddles::compute_twiddle(4, 23, direction);
2655        let twiddle5: Complex<T> = twiddles::compute_twiddle(5, 23, direction);
2656        let twiddle6: Complex<T> = twiddles::compute_twiddle(6, 23, direction);
2657        let twiddle7: Complex<T> = twiddles::compute_twiddle(7, 23, direction);
2658        let twiddle8: Complex<T> = twiddles::compute_twiddle(8, 23, direction);
2659        let twiddle9: Complex<T> = twiddles::compute_twiddle(9, 23, direction);
2660        let twiddle10: Complex<T> = twiddles::compute_twiddle(10, 23, direction);
2661        let twiddle11: Complex<T> = twiddles::compute_twiddle(11, 23, direction);
2662        Self {
2663            twiddle1,
2664            twiddle2,
2665            twiddle3,
2666            twiddle4,
2667            twiddle5,
2668            twiddle6,
2669            twiddle7,
2670            twiddle8,
2671            twiddle9,
2672            twiddle10,
2673            twiddle11,
2674            direction,
2675        }
2676    }
2677
2678    #[inline(never)]
2679    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
2680        // This function was derived in the same manner as the butterflies for length 3, 5 and 7.
2681        // However, instead of doing it by hand the actual code is autogenerated
2682        // with the `genbutterflies.py` script in the `tools` directory.
2683        let x122p = buffer.load(1) + buffer.load(22);
2684        let x122n = buffer.load(1) - buffer.load(22);
2685        let x221p = buffer.load(2) + buffer.load(21);
2686        let x221n = buffer.load(2) - buffer.load(21);
2687        let x320p = buffer.load(3) + buffer.load(20);
2688        let x320n = buffer.load(3) - buffer.load(20);
2689        let x419p = buffer.load(4) + buffer.load(19);
2690        let x419n = buffer.load(4) - buffer.load(19);
2691        let x518p = buffer.load(5) + buffer.load(18);
2692        let x518n = buffer.load(5) - buffer.load(18);
2693        let x617p = buffer.load(6) + buffer.load(17);
2694        let x617n = buffer.load(6) - buffer.load(17);
2695        let x716p = buffer.load(7) + buffer.load(16);
2696        let x716n = buffer.load(7) - buffer.load(16);
2697        let x815p = buffer.load(8) + buffer.load(15);
2698        let x815n = buffer.load(8) - buffer.load(15);
2699        let x914p = buffer.load(9) + buffer.load(14);
2700        let x914n = buffer.load(9) - buffer.load(14);
2701        let x1013p = buffer.load(10) + buffer.load(13);
2702        let x1013n = buffer.load(10) - buffer.load(13);
2703        let x1112p = buffer.load(11) + buffer.load(12);
2704        let x1112n = buffer.load(11) - buffer.load(12);
2705        let sum = buffer.load(0)
2706            + x122p
2707            + x221p
2708            + x320p
2709            + x419p
2710            + x518p
2711            + x617p
2712            + x716p
2713            + x815p
2714            + x914p
2715            + x1013p
2716            + x1112p;
2717        let b122re_a = buffer.load(0).re
2718            + self.twiddle1.re * x122p.re
2719            + self.twiddle2.re * x221p.re
2720            + self.twiddle3.re * x320p.re
2721            + self.twiddle4.re * x419p.re
2722            + self.twiddle5.re * x518p.re
2723            + self.twiddle6.re * x617p.re
2724            + self.twiddle7.re * x716p.re
2725            + self.twiddle8.re * x815p.re
2726            + self.twiddle9.re * x914p.re
2727            + self.twiddle10.re * x1013p.re
2728            + self.twiddle11.re * x1112p.re;
2729        let b122re_b = self.twiddle1.im * x122n.im
2730            + self.twiddle2.im * x221n.im
2731            + self.twiddle3.im * x320n.im
2732            + self.twiddle4.im * x419n.im
2733            + self.twiddle5.im * x518n.im
2734            + self.twiddle6.im * x617n.im
2735            + self.twiddle7.im * x716n.im
2736            + self.twiddle8.im * x815n.im
2737            + self.twiddle9.im * x914n.im
2738            + self.twiddle10.im * x1013n.im
2739            + self.twiddle11.im * x1112n.im;
2740        let b221re_a = buffer.load(0).re
2741            + self.twiddle2.re * x122p.re
2742            + self.twiddle4.re * x221p.re
2743            + self.twiddle6.re * x320p.re
2744            + self.twiddle8.re * x419p.re
2745            + self.twiddle10.re * x518p.re
2746            + self.twiddle11.re * x617p.re
2747            + self.twiddle9.re * x716p.re
2748            + self.twiddle7.re * x815p.re
2749            + self.twiddle5.re * x914p.re
2750            + self.twiddle3.re * x1013p.re
2751            + self.twiddle1.re * x1112p.re;
2752        let b221re_b = self.twiddle2.im * x122n.im
2753            + self.twiddle4.im * x221n.im
2754            + self.twiddle6.im * x320n.im
2755            + self.twiddle8.im * x419n.im
2756            + self.twiddle10.im * x518n.im
2757            + -self.twiddle11.im * x617n.im
2758            + -self.twiddle9.im * x716n.im
2759            + -self.twiddle7.im * x815n.im
2760            + -self.twiddle5.im * x914n.im
2761            + -self.twiddle3.im * x1013n.im
2762            + -self.twiddle1.im * x1112n.im;
2763        let b320re_a = buffer.load(0).re
2764            + self.twiddle3.re * x122p.re
2765            + self.twiddle6.re * x221p.re
2766            + self.twiddle9.re * x320p.re
2767            + self.twiddle11.re * x419p.re
2768            + self.twiddle8.re * x518p.re
2769            + self.twiddle5.re * x617p.re
2770            + self.twiddle2.re * x716p.re
2771            + self.twiddle1.re * x815p.re
2772            + self.twiddle4.re * x914p.re
2773            + self.twiddle7.re * x1013p.re
2774            + self.twiddle10.re * x1112p.re;
2775        let b320re_b = self.twiddle3.im * x122n.im
2776            + self.twiddle6.im * x221n.im
2777            + self.twiddle9.im * x320n.im
2778            + -self.twiddle11.im * x419n.im
2779            + -self.twiddle8.im * x518n.im
2780            + -self.twiddle5.im * x617n.im
2781            + -self.twiddle2.im * x716n.im
2782            + self.twiddle1.im * x815n.im
2783            + self.twiddle4.im * x914n.im
2784            + self.twiddle7.im * x1013n.im
2785            + self.twiddle10.im * x1112n.im;
2786        let b419re_a = buffer.load(0).re
2787            + self.twiddle4.re * x122p.re
2788            + self.twiddle8.re * x221p.re
2789            + self.twiddle11.re * x320p.re
2790            + self.twiddle7.re * x419p.re
2791            + self.twiddle3.re * x518p.re
2792            + self.twiddle1.re * x617p.re
2793            + self.twiddle5.re * x716p.re
2794            + self.twiddle9.re * x815p.re
2795            + self.twiddle10.re * x914p.re
2796            + self.twiddle6.re * x1013p.re
2797            + self.twiddle2.re * x1112p.re;
2798        let b419re_b = self.twiddle4.im * x122n.im
2799            + self.twiddle8.im * x221n.im
2800            + -self.twiddle11.im * x320n.im
2801            + -self.twiddle7.im * x419n.im
2802            + -self.twiddle3.im * x518n.im
2803            + self.twiddle1.im * x617n.im
2804            + self.twiddle5.im * x716n.im
2805            + self.twiddle9.im * x815n.im
2806            + -self.twiddle10.im * x914n.im
2807            + -self.twiddle6.im * x1013n.im
2808            + -self.twiddle2.im * x1112n.im;
2809        let b518re_a = buffer.load(0).re
2810            + self.twiddle5.re * x122p.re
2811            + self.twiddle10.re * x221p.re
2812            + self.twiddle8.re * x320p.re
2813            + self.twiddle3.re * x419p.re
2814            + self.twiddle2.re * x518p.re
2815            + self.twiddle7.re * x617p.re
2816            + self.twiddle11.re * x716p.re
2817            + self.twiddle6.re * x815p.re
2818            + self.twiddle1.re * x914p.re
2819            + self.twiddle4.re * x1013p.re
2820            + self.twiddle9.re * x1112p.re;
2821        let b518re_b = self.twiddle5.im * x122n.im
2822            + self.twiddle10.im * x221n.im
2823            + -self.twiddle8.im * x320n.im
2824            + -self.twiddle3.im * x419n.im
2825            + self.twiddle2.im * x518n.im
2826            + self.twiddle7.im * x617n.im
2827            + -self.twiddle11.im * x716n.im
2828            + -self.twiddle6.im * x815n.im
2829            + -self.twiddle1.im * x914n.im
2830            + self.twiddle4.im * x1013n.im
2831            + self.twiddle9.im * x1112n.im;
2832        let b617re_a = buffer.load(0).re
2833            + self.twiddle6.re * x122p.re
2834            + self.twiddle11.re * x221p.re
2835            + self.twiddle5.re * x320p.re
2836            + self.twiddle1.re * x419p.re
2837            + self.twiddle7.re * x518p.re
2838            + self.twiddle10.re * x617p.re
2839            + self.twiddle4.re * x716p.re
2840            + self.twiddle2.re * x815p.re
2841            + self.twiddle8.re * x914p.re
2842            + self.twiddle9.re * x1013p.re
2843            + self.twiddle3.re * x1112p.re;
2844        let b617re_b = self.twiddle6.im * x122n.im
2845            + -self.twiddle11.im * x221n.im
2846            + -self.twiddle5.im * x320n.im
2847            + self.twiddle1.im * x419n.im
2848            + self.twiddle7.im * x518n.im
2849            + -self.twiddle10.im * x617n.im
2850            + -self.twiddle4.im * x716n.im
2851            + self.twiddle2.im * x815n.im
2852            + self.twiddle8.im * x914n.im
2853            + -self.twiddle9.im * x1013n.im
2854            + -self.twiddle3.im * x1112n.im;
2855        let b716re_a = buffer.load(0).re
2856            + self.twiddle7.re * x122p.re
2857            + self.twiddle9.re * x221p.re
2858            + self.twiddle2.re * x320p.re
2859            + self.twiddle5.re * x419p.re
2860            + self.twiddle11.re * x518p.re
2861            + self.twiddle4.re * x617p.re
2862            + self.twiddle3.re * x716p.re
2863            + self.twiddle10.re * x815p.re
2864            + self.twiddle6.re * x914p.re
2865            + self.twiddle1.re * x1013p.re
2866            + self.twiddle8.re * x1112p.re;
2867        let b716re_b = self.twiddle7.im * x122n.im
2868            + -self.twiddle9.im * x221n.im
2869            + -self.twiddle2.im * x320n.im
2870            + self.twiddle5.im * x419n.im
2871            + -self.twiddle11.im * x518n.im
2872            + -self.twiddle4.im * x617n.im
2873            + self.twiddle3.im * x716n.im
2874            + self.twiddle10.im * x815n.im
2875            + -self.twiddle6.im * x914n.im
2876            + self.twiddle1.im * x1013n.im
2877            + self.twiddle8.im * x1112n.im;
2878        let b815re_a = buffer.load(0).re
2879            + self.twiddle8.re * x122p.re
2880            + self.twiddle7.re * x221p.re
2881            + self.twiddle1.re * x320p.re
2882            + self.twiddle9.re * x419p.re
2883            + self.twiddle6.re * x518p.re
2884            + self.twiddle2.re * x617p.re
2885            + self.twiddle10.re * x716p.re
2886            + self.twiddle5.re * x815p.re
2887            + self.twiddle3.re * x914p.re
2888            + self.twiddle11.re * x1013p.re
2889            + self.twiddle4.re * x1112p.re;
2890        let b815re_b = self.twiddle8.im * x122n.im
2891            + -self.twiddle7.im * x221n.im
2892            + self.twiddle1.im * x320n.im
2893            + self.twiddle9.im * x419n.im
2894            + -self.twiddle6.im * x518n.im
2895            + self.twiddle2.im * x617n.im
2896            + self.twiddle10.im * x716n.im
2897            + -self.twiddle5.im * x815n.im
2898            + self.twiddle3.im * x914n.im
2899            + self.twiddle11.im * x1013n.im
2900            + -self.twiddle4.im * x1112n.im;
2901        let b914re_a = buffer.load(0).re
2902            + self.twiddle9.re * x122p.re
2903            + self.twiddle5.re * x221p.re
2904            + self.twiddle4.re * x320p.re
2905            + self.twiddle10.re * x419p.re
2906            + self.twiddle1.re * x518p.re
2907            + self.twiddle8.re * x617p.re
2908            + self.twiddle6.re * x716p.re
2909            + self.twiddle3.re * x815p.re
2910            + self.twiddle11.re * x914p.re
2911            + self.twiddle2.re * x1013p.re
2912            + self.twiddle7.re * x1112p.re;
2913        let b914re_b = self.twiddle9.im * x122n.im
2914            + -self.twiddle5.im * x221n.im
2915            + self.twiddle4.im * x320n.im
2916            + -self.twiddle10.im * x419n.im
2917            + -self.twiddle1.im * x518n.im
2918            + self.twiddle8.im * x617n.im
2919            + -self.twiddle6.im * x716n.im
2920            + self.twiddle3.im * x815n.im
2921            + -self.twiddle11.im * x914n.im
2922            + -self.twiddle2.im * x1013n.im
2923            + self.twiddle7.im * x1112n.im;
2924        let b1013re_a = buffer.load(0).re
2925            + self.twiddle10.re * x122p.re
2926            + self.twiddle3.re * x221p.re
2927            + self.twiddle7.re * x320p.re
2928            + self.twiddle6.re * x419p.re
2929            + self.twiddle4.re * x518p.re
2930            + self.twiddle9.re * x617p.re
2931            + self.twiddle1.re * x716p.re
2932            + self.twiddle11.re * x815p.re
2933            + self.twiddle2.re * x914p.re
2934            + self.twiddle8.re * x1013p.re
2935            + self.twiddle5.re * x1112p.re;
2936        let b1013re_b = self.twiddle10.im * x122n.im
2937            + -self.twiddle3.im * x221n.im
2938            + self.twiddle7.im * x320n.im
2939            + -self.twiddle6.im * x419n.im
2940            + self.twiddle4.im * x518n.im
2941            + -self.twiddle9.im * x617n.im
2942            + self.twiddle1.im * x716n.im
2943            + self.twiddle11.im * x815n.im
2944            + -self.twiddle2.im * x914n.im
2945            + self.twiddle8.im * x1013n.im
2946            + -self.twiddle5.im * x1112n.im;
2947        let b1112re_a = buffer.load(0).re
2948            + self.twiddle11.re * x122p.re
2949            + self.twiddle1.re * x221p.re
2950            + self.twiddle10.re * x320p.re
2951            + self.twiddle2.re * x419p.re
2952            + self.twiddle9.re * x518p.re
2953            + self.twiddle3.re * x617p.re
2954            + self.twiddle8.re * x716p.re
2955            + self.twiddle4.re * x815p.re
2956            + self.twiddle7.re * x914p.re
2957            + self.twiddle5.re * x1013p.re
2958            + self.twiddle6.re * x1112p.re;
2959        let b1112re_b = self.twiddle11.im * x122n.im
2960            + -self.twiddle1.im * x221n.im
2961            + self.twiddle10.im * x320n.im
2962            + -self.twiddle2.im * x419n.im
2963            + self.twiddle9.im * x518n.im
2964            + -self.twiddle3.im * x617n.im
2965            + self.twiddle8.im * x716n.im
2966            + -self.twiddle4.im * x815n.im
2967            + self.twiddle7.im * x914n.im
2968            + -self.twiddle5.im * x1013n.im
2969            + self.twiddle6.im * x1112n.im;
2970
2971        let b122im_a = buffer.load(0).im
2972            + self.twiddle1.re * x122p.im
2973            + self.twiddle2.re * x221p.im
2974            + self.twiddle3.re * x320p.im
2975            + self.twiddle4.re * x419p.im
2976            + self.twiddle5.re * x518p.im
2977            + self.twiddle6.re * x617p.im
2978            + self.twiddle7.re * x716p.im
2979            + self.twiddle8.re * x815p.im
2980            + self.twiddle9.re * x914p.im
2981            + self.twiddle10.re * x1013p.im
2982            + self.twiddle11.re * x1112p.im;
2983        let b122im_b = self.twiddle1.im * x122n.re
2984            + self.twiddle2.im * x221n.re
2985            + self.twiddle3.im * x320n.re
2986            + self.twiddle4.im * x419n.re
2987            + self.twiddle5.im * x518n.re
2988            + self.twiddle6.im * x617n.re
2989            + self.twiddle7.im * x716n.re
2990            + self.twiddle8.im * x815n.re
2991            + self.twiddle9.im * x914n.re
2992            + self.twiddle10.im * x1013n.re
2993            + self.twiddle11.im * x1112n.re;
2994        let b221im_a = buffer.load(0).im
2995            + self.twiddle2.re * x122p.im
2996            + self.twiddle4.re * x221p.im
2997            + self.twiddle6.re * x320p.im
2998            + self.twiddle8.re * x419p.im
2999            + self.twiddle10.re * x518p.im
3000            + self.twiddle11.re * x617p.im
3001            + self.twiddle9.re * x716p.im
3002            + self.twiddle7.re * x815p.im
3003            + self.twiddle5.re * x914p.im
3004            + self.twiddle3.re * x1013p.im
3005            + self.twiddle1.re * x1112p.im;
3006        let b221im_b = self.twiddle2.im * x122n.re
3007            + self.twiddle4.im * x221n.re
3008            + self.twiddle6.im * x320n.re
3009            + self.twiddle8.im * x419n.re
3010            + self.twiddle10.im * x518n.re
3011            + -self.twiddle11.im * x617n.re
3012            + -self.twiddle9.im * x716n.re
3013            + -self.twiddle7.im * x815n.re
3014            + -self.twiddle5.im * x914n.re
3015            + -self.twiddle3.im * x1013n.re
3016            + -self.twiddle1.im * x1112n.re;
3017        let b320im_a = buffer.load(0).im
3018            + self.twiddle3.re * x122p.im
3019            + self.twiddle6.re * x221p.im
3020            + self.twiddle9.re * x320p.im
3021            + self.twiddle11.re * x419p.im
3022            + self.twiddle8.re * x518p.im
3023            + self.twiddle5.re * x617p.im
3024            + self.twiddle2.re * x716p.im
3025            + self.twiddle1.re * x815p.im
3026            + self.twiddle4.re * x914p.im
3027            + self.twiddle7.re * x1013p.im
3028            + self.twiddle10.re * x1112p.im;
3029        let b320im_b = self.twiddle3.im * x122n.re
3030            + self.twiddle6.im * x221n.re
3031            + self.twiddle9.im * x320n.re
3032            + -self.twiddle11.im * x419n.re
3033            + -self.twiddle8.im * x518n.re
3034            + -self.twiddle5.im * x617n.re
3035            + -self.twiddle2.im * x716n.re
3036            + self.twiddle1.im * x815n.re
3037            + self.twiddle4.im * x914n.re
3038            + self.twiddle7.im * x1013n.re
3039            + self.twiddle10.im * x1112n.re;
3040        let b419im_a = buffer.load(0).im
3041            + self.twiddle4.re * x122p.im
3042            + self.twiddle8.re * x221p.im
3043            + self.twiddle11.re * x320p.im
3044            + self.twiddle7.re * x419p.im
3045            + self.twiddle3.re * x518p.im
3046            + self.twiddle1.re * x617p.im
3047            + self.twiddle5.re * x716p.im
3048            + self.twiddle9.re * x815p.im
3049            + self.twiddle10.re * x914p.im
3050            + self.twiddle6.re * x1013p.im
3051            + self.twiddle2.re * x1112p.im;
3052        let b419im_b = self.twiddle4.im * x122n.re
3053            + self.twiddle8.im * x221n.re
3054            + -self.twiddle11.im * x320n.re
3055            + -self.twiddle7.im * x419n.re
3056            + -self.twiddle3.im * x518n.re
3057            + self.twiddle1.im * x617n.re
3058            + self.twiddle5.im * x716n.re
3059            + self.twiddle9.im * x815n.re
3060            + -self.twiddle10.im * x914n.re
3061            + -self.twiddle6.im * x1013n.re
3062            + -self.twiddle2.im * x1112n.re;
3063        let b518im_a = buffer.load(0).im
3064            + self.twiddle5.re * x122p.im
3065            + self.twiddle10.re * x221p.im
3066            + self.twiddle8.re * x320p.im
3067            + self.twiddle3.re * x419p.im
3068            + self.twiddle2.re * x518p.im
3069            + self.twiddle7.re * x617p.im
3070            + self.twiddle11.re * x716p.im
3071            + self.twiddle6.re * x815p.im
3072            + self.twiddle1.re * x914p.im
3073            + self.twiddle4.re * x1013p.im
3074            + self.twiddle9.re * x1112p.im;
3075        let b518im_b = self.twiddle5.im * x122n.re
3076            + self.twiddle10.im * x221n.re
3077            + -self.twiddle8.im * x320n.re
3078            + -self.twiddle3.im * x419n.re
3079            + self.twiddle2.im * x518n.re
3080            + self.twiddle7.im * x617n.re
3081            + -self.twiddle11.im * x716n.re
3082            + -self.twiddle6.im * x815n.re
3083            + -self.twiddle1.im * x914n.re
3084            + self.twiddle4.im * x1013n.re
3085            + self.twiddle9.im * x1112n.re;
3086        let b617im_a = buffer.load(0).im
3087            + self.twiddle6.re * x122p.im
3088            + self.twiddle11.re * x221p.im
3089            + self.twiddle5.re * x320p.im
3090            + self.twiddle1.re * x419p.im
3091            + self.twiddle7.re * x518p.im
3092            + self.twiddle10.re * x617p.im
3093            + self.twiddle4.re * x716p.im
3094            + self.twiddle2.re * x815p.im
3095            + self.twiddle8.re * x914p.im
3096            + self.twiddle9.re * x1013p.im
3097            + self.twiddle3.re * x1112p.im;
3098        let b617im_b = self.twiddle6.im * x122n.re
3099            + -self.twiddle11.im * x221n.re
3100            + -self.twiddle5.im * x320n.re
3101            + self.twiddle1.im * x419n.re
3102            + self.twiddle7.im * x518n.re
3103            + -self.twiddle10.im * x617n.re
3104            + -self.twiddle4.im * x716n.re
3105            + self.twiddle2.im * x815n.re
3106            + self.twiddle8.im * x914n.re
3107            + -self.twiddle9.im * x1013n.re
3108            + -self.twiddle3.im * x1112n.re;
3109        let b716im_a = buffer.load(0).im
3110            + self.twiddle7.re * x122p.im
3111            + self.twiddle9.re * x221p.im
3112            + self.twiddle2.re * x320p.im
3113            + self.twiddle5.re * x419p.im
3114            + self.twiddle11.re * x518p.im
3115            + self.twiddle4.re * x617p.im
3116            + self.twiddle3.re * x716p.im
3117            + self.twiddle10.re * x815p.im
3118            + self.twiddle6.re * x914p.im
3119            + self.twiddle1.re * x1013p.im
3120            + self.twiddle8.re * x1112p.im;
3121        let b716im_b = self.twiddle7.im * x122n.re
3122            + -self.twiddle9.im * x221n.re
3123            + -self.twiddle2.im * x320n.re
3124            + self.twiddle5.im * x419n.re
3125            + -self.twiddle11.im * x518n.re
3126            + -self.twiddle4.im * x617n.re
3127            + self.twiddle3.im * x716n.re
3128            + self.twiddle10.im * x815n.re
3129            + -self.twiddle6.im * x914n.re
3130            + self.twiddle1.im * x1013n.re
3131            + self.twiddle8.im * x1112n.re;
3132        let b815im_a = buffer.load(0).im
3133            + self.twiddle8.re * x122p.im
3134            + self.twiddle7.re * x221p.im
3135            + self.twiddle1.re * x320p.im
3136            + self.twiddle9.re * x419p.im
3137            + self.twiddle6.re * x518p.im
3138            + self.twiddle2.re * x617p.im
3139            + self.twiddle10.re * x716p.im
3140            + self.twiddle5.re * x815p.im
3141            + self.twiddle3.re * x914p.im
3142            + self.twiddle11.re * x1013p.im
3143            + self.twiddle4.re * x1112p.im;
3144        let b815im_b = self.twiddle8.im * x122n.re
3145            + -self.twiddle7.im * x221n.re
3146            + self.twiddle1.im * x320n.re
3147            + self.twiddle9.im * x419n.re
3148            + -self.twiddle6.im * x518n.re
3149            + self.twiddle2.im * x617n.re
3150            + self.twiddle10.im * x716n.re
3151            + -self.twiddle5.im * x815n.re
3152            + self.twiddle3.im * x914n.re
3153            + self.twiddle11.im * x1013n.re
3154            + -self.twiddle4.im * x1112n.re;
3155        let b914im_a = buffer.load(0).im
3156            + self.twiddle9.re * x122p.im
3157            + self.twiddle5.re * x221p.im
3158            + self.twiddle4.re * x320p.im
3159            + self.twiddle10.re * x419p.im
3160            + self.twiddle1.re * x518p.im
3161            + self.twiddle8.re * x617p.im
3162            + self.twiddle6.re * x716p.im
3163            + self.twiddle3.re * x815p.im
3164            + self.twiddle11.re * x914p.im
3165            + self.twiddle2.re * x1013p.im
3166            + self.twiddle7.re * x1112p.im;
3167        let b914im_b = self.twiddle9.im * x122n.re
3168            + -self.twiddle5.im * x221n.re
3169            + self.twiddle4.im * x320n.re
3170            + -self.twiddle10.im * x419n.re
3171            + -self.twiddle1.im * x518n.re
3172            + self.twiddle8.im * x617n.re
3173            + -self.twiddle6.im * x716n.re
3174            + self.twiddle3.im * x815n.re
3175            + -self.twiddle11.im * x914n.re
3176            + -self.twiddle2.im * x1013n.re
3177            + self.twiddle7.im * x1112n.re;
3178        let b1013im_a = buffer.load(0).im
3179            + self.twiddle10.re * x122p.im
3180            + self.twiddle3.re * x221p.im
3181            + self.twiddle7.re * x320p.im
3182            + self.twiddle6.re * x419p.im
3183            + self.twiddle4.re * x518p.im
3184            + self.twiddle9.re * x617p.im
3185            + self.twiddle1.re * x716p.im
3186            + self.twiddle11.re * x815p.im
3187            + self.twiddle2.re * x914p.im
3188            + self.twiddle8.re * x1013p.im
3189            + self.twiddle5.re * x1112p.im;
3190        let b1013im_b = self.twiddle10.im * x122n.re
3191            + -self.twiddle3.im * x221n.re
3192            + self.twiddle7.im * x320n.re
3193            + -self.twiddle6.im * x419n.re
3194            + self.twiddle4.im * x518n.re
3195            + -self.twiddle9.im * x617n.re
3196            + self.twiddle1.im * x716n.re
3197            + self.twiddle11.im * x815n.re
3198            + -self.twiddle2.im * x914n.re
3199            + self.twiddle8.im * x1013n.re
3200            + -self.twiddle5.im * x1112n.re;
3201        let b1112im_a = buffer.load(0).im
3202            + self.twiddle11.re * x122p.im
3203            + self.twiddle1.re * x221p.im
3204            + self.twiddle10.re * x320p.im
3205            + self.twiddle2.re * x419p.im
3206            + self.twiddle9.re * x518p.im
3207            + self.twiddle3.re * x617p.im
3208            + self.twiddle8.re * x716p.im
3209            + self.twiddle4.re * x815p.im
3210            + self.twiddle7.re * x914p.im
3211            + self.twiddle5.re * x1013p.im
3212            + self.twiddle6.re * x1112p.im;
3213        let b1112im_b = self.twiddle11.im * x122n.re
3214            + -self.twiddle1.im * x221n.re
3215            + self.twiddle10.im * x320n.re
3216            + -self.twiddle2.im * x419n.re
3217            + self.twiddle9.im * x518n.re
3218            + -self.twiddle3.im * x617n.re
3219            + self.twiddle8.im * x716n.re
3220            + -self.twiddle4.im * x815n.re
3221            + self.twiddle7.im * x914n.re
3222            + -self.twiddle5.im * x1013n.re
3223            + self.twiddle6.im * x1112n.re;
3224
3225        let out1re = b122re_a - b122re_b;
3226        let out1im = b122im_a + b122im_b;
3227        let out2re = b221re_a - b221re_b;
3228        let out2im = b221im_a + b221im_b;
3229        let out3re = b320re_a - b320re_b;
3230        let out3im = b320im_a + b320im_b;
3231        let out4re = b419re_a - b419re_b;
3232        let out4im = b419im_a + b419im_b;
3233        let out5re = b518re_a - b518re_b;
3234        let out5im = b518im_a + b518im_b;
3235        let out6re = b617re_a - b617re_b;
3236        let out6im = b617im_a + b617im_b;
3237        let out7re = b716re_a - b716re_b;
3238        let out7im = b716im_a + b716im_b;
3239        let out8re = b815re_a - b815re_b;
3240        let out8im = b815im_a + b815im_b;
3241        let out9re = b914re_a - b914re_b;
3242        let out9im = b914im_a + b914im_b;
3243        let out10re = b1013re_a - b1013re_b;
3244        let out10im = b1013im_a + b1013im_b;
3245        let out11re = b1112re_a - b1112re_b;
3246        let out11im = b1112im_a + b1112im_b;
3247        let out12re = b1112re_a + b1112re_b;
3248        let out12im = b1112im_a - b1112im_b;
3249        let out13re = b1013re_a + b1013re_b;
3250        let out13im = b1013im_a - b1013im_b;
3251        let out14re = b914re_a + b914re_b;
3252        let out14im = b914im_a - b914im_b;
3253        let out15re = b815re_a + b815re_b;
3254        let out15im = b815im_a - b815im_b;
3255        let out16re = b716re_a + b716re_b;
3256        let out16im = b716im_a - b716im_b;
3257        let out17re = b617re_a + b617re_b;
3258        let out17im = b617im_a - b617im_b;
3259        let out18re = b518re_a + b518re_b;
3260        let out18im = b518im_a - b518im_b;
3261        let out19re = b419re_a + b419re_b;
3262        let out19im = b419im_a - b419im_b;
3263        let out20re = b320re_a + b320re_b;
3264        let out20im = b320im_a - b320im_b;
3265        let out21re = b221re_a + b221re_b;
3266        let out21im = b221im_a - b221im_b;
3267        let out22re = b122re_a + b122re_b;
3268        let out22im = b122im_a - b122im_b;
3269        buffer.store(sum, 0);
3270        buffer.store(
3271            Complex {
3272                re: out1re,
3273                im: out1im,
3274            },
3275            1,
3276        );
3277        buffer.store(
3278            Complex {
3279                re: out2re,
3280                im: out2im,
3281            },
3282            2,
3283        );
3284        buffer.store(
3285            Complex {
3286                re: out3re,
3287                im: out3im,
3288            },
3289            3,
3290        );
3291        buffer.store(
3292            Complex {
3293                re: out4re,
3294                im: out4im,
3295            },
3296            4,
3297        );
3298        buffer.store(
3299            Complex {
3300                re: out5re,
3301                im: out5im,
3302            },
3303            5,
3304        );
3305        buffer.store(
3306            Complex {
3307                re: out6re,
3308                im: out6im,
3309            },
3310            6,
3311        );
3312        buffer.store(
3313            Complex {
3314                re: out7re,
3315                im: out7im,
3316            },
3317            7,
3318        );
3319        buffer.store(
3320            Complex {
3321                re: out8re,
3322                im: out8im,
3323            },
3324            8,
3325        );
3326        buffer.store(
3327            Complex {
3328                re: out9re,
3329                im: out9im,
3330            },
3331            9,
3332        );
3333        buffer.store(
3334            Complex {
3335                re: out10re,
3336                im: out10im,
3337            },
3338            10,
3339        );
3340        buffer.store(
3341            Complex {
3342                re: out11re,
3343                im: out11im,
3344            },
3345            11,
3346        );
3347        buffer.store(
3348            Complex {
3349                re: out12re,
3350                im: out12im,
3351            },
3352            12,
3353        );
3354        buffer.store(
3355            Complex {
3356                re: out13re,
3357                im: out13im,
3358            },
3359            13,
3360        );
3361        buffer.store(
3362            Complex {
3363                re: out14re,
3364                im: out14im,
3365            },
3366            14,
3367        );
3368        buffer.store(
3369            Complex {
3370                re: out15re,
3371                im: out15im,
3372            },
3373            15,
3374        );
3375        buffer.store(
3376            Complex {
3377                re: out16re,
3378                im: out16im,
3379            },
3380            16,
3381        );
3382        buffer.store(
3383            Complex {
3384                re: out17re,
3385                im: out17im,
3386            },
3387            17,
3388        );
3389        buffer.store(
3390            Complex {
3391                re: out18re,
3392                im: out18im,
3393            },
3394            18,
3395        );
3396        buffer.store(
3397            Complex {
3398                re: out19re,
3399                im: out19im,
3400            },
3401            19,
3402        );
3403        buffer.store(
3404            Complex {
3405                re: out20re,
3406                im: out20im,
3407            },
3408            20,
3409        );
3410        buffer.store(
3411            Complex {
3412                re: out21re,
3413                im: out21im,
3414            },
3415            21,
3416        );
3417        buffer.store(
3418            Complex {
3419                re: out22re,
3420                im: out22im,
3421            },
3422            22,
3423        );
3424    }
3425}
3426
3427pub struct Butterfly24<T> {
3428    butterfly4: Butterfly4<T>,
3429    butterfly6: Butterfly6<T>,
3430    twiddle1: Complex<T>,
3431    twiddle2: Complex<T>,
3432    twiddle4: Complex<T>,
3433    twiddle5: Complex<T>,
3434    twiddle8: Complex<T>,
3435    twiddle10: Complex<T>,
3436    root2: T,
3437}
3438boilerplate_fft_butterfly!(Butterfly24, 24, |this: &Butterfly24<_>| this
3439    .butterfly4
3440    .fft_direction());
3441impl<T: FftNum> Butterfly24<T> {
3442    #[inline(always)]
3443    pub fn new(direction: FftDirection) -> Self {
3444        Self {
3445            butterfly4: Butterfly4::new(direction),
3446            butterfly6: Butterfly6::new(direction),
3447            twiddle1: twiddles::compute_twiddle(1, 24, direction),
3448            twiddle2: twiddles::compute_twiddle(2, 24, direction),
3449            twiddle4: twiddles::compute_twiddle(4, 24, direction),
3450            twiddle5: twiddles::compute_twiddle(5, 24, direction),
3451            twiddle8: twiddles::compute_twiddle(8, 24, direction),
3452            twiddle10: twiddles::compute_twiddle(10, 24, direction),
3453            root2: T::from_f64(0.5f64.sqrt()).unwrap(),
3454        }
3455    }
3456    #[inline(never)]
3457    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
3458        // algorithm: 6x4 mixed radix
3459        // step 1: transpose the input directly into the scratch.
3460        let mut scratch0 = [
3461            buffer.load(0),
3462            buffer.load(4),
3463            buffer.load(8),
3464            buffer.load(12),
3465            buffer.load(16),
3466            buffer.load(20),
3467        ];
3468        let mut scratch1 = [
3469            buffer.load(1),
3470            buffer.load(5),
3471            buffer.load(9),
3472            buffer.load(13),
3473            buffer.load(17),
3474            buffer.load(21),
3475        ];
3476        let mut scratch2 = [
3477            buffer.load(2),
3478            buffer.load(6),
3479            buffer.load(10),
3480            buffer.load(14),
3481            buffer.load(18),
3482            buffer.load(22),
3483        ];
3484        let mut scratch3 = [
3485            buffer.load(3),
3486            buffer.load(7),
3487            buffer.load(11),
3488            buffer.load(15),
3489            buffer.load(19),
3490            buffer.load(23),
3491        ];
3492
3493        // step 2: column FFTs
3494        self.butterfly6.perform_fft_contiguous(&mut scratch0);
3495        self.butterfly6.perform_fft_contiguous(&mut scratch1);
3496        self.butterfly6.perform_fft_contiguous(&mut scratch2);
3497        self.butterfly6.perform_fft_contiguous(&mut scratch3);
3498
3499        // step 3: apply twiddle factors
3500        scratch1[1] = scratch1[1] * self.twiddle1;
3501        scratch1[2] = scratch1[2] * self.twiddle2;
3502        scratch1[3] =
3503            (twiddles::rotate_90(scratch1[3], self.fft_direction()) + scratch1[3]) * self.root2;
3504        scratch1[4] = scratch1[4] * self.twiddle4;
3505        scratch1[5] = scratch1[5] * self.twiddle5;
3506        scratch2[1] = scratch2[1] * self.twiddle2;
3507        scratch2[2] = scratch2[2] * self.twiddle4;
3508        scratch2[3] = twiddles::rotate_90(scratch2[3], self.fft_direction());
3509        scratch2[4] = scratch2[4] * self.twiddle8;
3510        scratch2[5] = scratch2[5] * self.twiddle10;
3511        scratch3[1] =
3512            (twiddles::rotate_90(scratch3[1], self.fft_direction()) + scratch3[1]) * self.root2;
3513        scratch3[2] = twiddles::rotate_90(scratch3[2], self.fft_direction());
3514        scratch3[3] =
3515            (twiddles::rotate_90(scratch3[3], self.fft_direction()) - scratch3[3]) * self.root2;
3516        scratch3[4] = -scratch3[4];
3517        scratch3[5] =
3518            (twiddles::rotate_90(scratch3[5], self.fft_direction()) + scratch3[5]) * -self.root2;
3519
3520        // step 4: SKIPPED because the next FFTs will be non-contiguous
3521
3522        // step 5: row FFTs
3523        self.butterfly4.perform_fft_strided(
3524            &mut scratch0[0],
3525            &mut scratch1[0],
3526            &mut scratch2[0],
3527            &mut scratch3[0],
3528        );
3529        self.butterfly4.perform_fft_strided(
3530            &mut scratch0[1],
3531            &mut scratch1[1],
3532            &mut scratch2[1],
3533            &mut scratch3[1],
3534        );
3535        self.butterfly4.perform_fft_strided(
3536            &mut scratch0[2],
3537            &mut scratch1[2],
3538            &mut scratch2[2],
3539            &mut scratch3[2],
3540        );
3541        self.butterfly4.perform_fft_strided(
3542            &mut scratch0[3],
3543            &mut scratch1[3],
3544            &mut scratch2[3],
3545            &mut scratch3[3],
3546        );
3547        self.butterfly4.perform_fft_strided(
3548            &mut scratch0[4],
3549            &mut scratch1[4],
3550            &mut scratch2[4],
3551            &mut scratch3[4],
3552        );
3553        self.butterfly4.perform_fft_strided(
3554            &mut scratch0[5],
3555            &mut scratch1[5],
3556            &mut scratch2[5],
3557            &mut scratch3[5],
3558        );
3559
3560        // step 6: copy back to the buffer. we can skip the transpose, because we skipped step 4
3561        buffer.store(scratch0[0], 0);
3562        buffer.store(scratch0[1], 1);
3563        buffer.store(scratch0[2], 2);
3564        buffer.store(scratch0[3], 3);
3565        buffer.store(scratch0[4], 4);
3566        buffer.store(scratch0[5], 5);
3567        buffer.store(scratch1[0], 6);
3568        buffer.store(scratch1[1], 7);
3569        buffer.store(scratch1[2], 8);
3570        buffer.store(scratch1[3], 9);
3571        buffer.store(scratch1[4], 10);
3572        buffer.store(scratch1[5], 11);
3573        buffer.store(scratch2[0], 12);
3574        buffer.store(scratch2[1], 13);
3575        buffer.store(scratch2[2], 14);
3576        buffer.store(scratch2[3], 15);
3577        buffer.store(scratch2[4], 16);
3578        buffer.store(scratch2[5], 17);
3579        buffer.store(scratch3[0], 18);
3580        buffer.store(scratch3[1], 19);
3581        buffer.store(scratch3[2], 20);
3582        buffer.store(scratch3[3], 21);
3583        buffer.store(scratch3[4], 22);
3584        buffer.store(scratch3[5], 23);
3585    }
3586}
3587
3588pub struct Butterfly27<T> {
3589    butterfly9: Butterfly9<T>,
3590    twiddles: [Complex<T>; 12],
3591}
3592boilerplate_fft_butterfly!(Butterfly27, 27, |this: &Butterfly27<_>| this
3593    .butterfly9
3594    .fft_direction());
3595impl<T: FftNum> Butterfly27<T> {
3596    #[inline(always)]
3597    pub fn new(direction: FftDirection) -> Self {
3598        Self {
3599            butterfly9: Butterfly9::new(direction),
3600            twiddles: [
3601                twiddles::compute_twiddle(1, 27, direction),
3602                twiddles::compute_twiddle(2, 27, direction),
3603                twiddles::compute_twiddle(3, 27, direction),
3604                twiddles::compute_twiddle(4, 27, direction),
3605                twiddles::compute_twiddle(5, 27, direction),
3606                twiddles::compute_twiddle(6, 27, direction),
3607                twiddles::compute_twiddle(7, 27, direction),
3608                twiddles::compute_twiddle(8, 27, direction),
3609                twiddles::compute_twiddle(10, 27, direction),
3610                twiddles::compute_twiddle(12, 27, direction),
3611                twiddles::compute_twiddle(14, 27, direction),
3612                twiddles::compute_twiddle(16, 27, direction),
3613            ],
3614        }
3615    }
3616
3617    #[inline(always)]
3618    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
3619        // algorithm: mixed radix with width=9 and height=3
3620
3621        // step 1: transpose the input into the scratch
3622        let mut scratch0 = [
3623            buffer.load(0),
3624            buffer.load(3),
3625            buffer.load(6),
3626            buffer.load(9),
3627            buffer.load(12),
3628            buffer.load(15),
3629            buffer.load(18),
3630            buffer.load(21),
3631            buffer.load(24),
3632        ];
3633        let mut scratch1 = [
3634            buffer.load(1 + 0),
3635            buffer.load(1 + 3),
3636            buffer.load(1 + 6),
3637            buffer.load(1 + 9),
3638            buffer.load(1 + 12),
3639            buffer.load(1 + 15),
3640            buffer.load(1 + 18),
3641            buffer.load(1 + 21),
3642            buffer.load(1 + 24),
3643        ];
3644        let mut scratch2 = [
3645            buffer.load(2 + 0),
3646            buffer.load(2 + 3),
3647            buffer.load(2 + 6),
3648            buffer.load(2 + 9),
3649            buffer.load(2 + 12),
3650            buffer.load(2 + 15),
3651            buffer.load(2 + 18),
3652            buffer.load(2 + 21),
3653            buffer.load(2 + 24),
3654        ];
3655
3656        // step 2: column FFTs
3657        self.butterfly9.perform_fft_contiguous(&mut scratch0);
3658        self.butterfly9.perform_fft_contiguous(&mut scratch1);
3659        self.butterfly9.perform_fft_contiguous(&mut scratch2);
3660
3661        // step 3: apply twiddle factors
3662        scratch1[1] = scratch1[1] * self.twiddles[0];
3663        scratch1[2] = scratch1[2] * self.twiddles[1];
3664        scratch1[3] = scratch1[3] * self.twiddles[2];
3665        scratch1[4] = scratch1[4] * self.twiddles[3];
3666        scratch1[5] = scratch1[5] * self.twiddles[4];
3667        scratch1[6] = scratch1[6] * self.twiddles[5];
3668        scratch1[7] = scratch1[7] * self.twiddles[6];
3669        scratch1[8] = scratch1[8] * self.twiddles[7];
3670        scratch2[1] = scratch2[1] * self.twiddles[1];
3671        scratch2[2] = scratch2[2] * self.twiddles[3];
3672        scratch2[3] = scratch2[3] * self.twiddles[5];
3673        scratch2[4] = scratch2[4] * self.twiddles[7];
3674        scratch2[5] = scratch2[5] * self.twiddles[8];
3675        scratch2[6] = scratch2[6] * self.twiddles[9];
3676        scratch2[7] = scratch2[7] * self.twiddles[10];
3677        scratch2[8] = scratch2[8] * self.twiddles[11];
3678
3679        // step 4: SKIPPED because the next FFTs will be non-contiguous
3680
3681        // step 5: row FFTs
3682        self.butterfly9.butterfly3.perform_fft_strided(
3683            &mut scratch0[0],
3684            &mut scratch1[0],
3685            &mut scratch2[0],
3686        );
3687        self.butterfly9.butterfly3.perform_fft_strided(
3688            &mut scratch0[1],
3689            &mut scratch1[1],
3690            &mut scratch2[1],
3691        );
3692        self.butterfly9.butterfly3.perform_fft_strided(
3693            &mut scratch0[2],
3694            &mut scratch1[2],
3695            &mut scratch2[2],
3696        );
3697        self.butterfly9.butterfly3.perform_fft_strided(
3698            &mut scratch0[3],
3699            &mut scratch1[3],
3700            &mut scratch2[3],
3701        );
3702        self.butterfly9.butterfly3.perform_fft_strided(
3703            &mut scratch0[4],
3704            &mut scratch1[4],
3705            &mut scratch2[4],
3706        );
3707        self.butterfly9.butterfly3.perform_fft_strided(
3708            &mut scratch0[5],
3709            &mut scratch1[5],
3710            &mut scratch2[5],
3711        );
3712        self.butterfly9.butterfly3.perform_fft_strided(
3713            &mut scratch0[6],
3714            &mut scratch1[6],
3715            &mut scratch2[6],
3716        );
3717        self.butterfly9.butterfly3.perform_fft_strided(
3718            &mut scratch0[7],
3719            &mut scratch1[7],
3720            &mut scratch2[7],
3721        );
3722        self.butterfly9.butterfly3.perform_fft_strided(
3723            &mut scratch0[8],
3724            &mut scratch1[8],
3725            &mut scratch2[8],
3726        );
3727
3728        // step 6: copy the result into the output. normally we'd need to do a transpose here, but we can skip it because we skipped the transpose in step 4
3729        buffer.store(scratch0[0], 0);
3730        buffer.store(scratch0[1], 1);
3731        buffer.store(scratch0[2], 2);
3732        buffer.store(scratch0[3], 3);
3733        buffer.store(scratch0[4], 4);
3734        buffer.store(scratch0[5], 5);
3735        buffer.store(scratch0[6], 6);
3736        buffer.store(scratch0[7], 7);
3737        buffer.store(scratch0[8], 8);
3738
3739        buffer.store(scratch1[0], 9 + 0);
3740        buffer.store(scratch1[1], 9 + 1);
3741        buffer.store(scratch1[2], 9 + 2);
3742        buffer.store(scratch1[3], 9 + 3);
3743        buffer.store(scratch1[4], 9 + 4);
3744        buffer.store(scratch1[5], 9 + 5);
3745        buffer.store(scratch1[6], 9 + 6);
3746        buffer.store(scratch1[7], 9 + 7);
3747        buffer.store(scratch1[8], 9 + 8);
3748
3749        buffer.store(scratch2[0], 18 + 0);
3750        buffer.store(scratch2[1], 18 + 1);
3751        buffer.store(scratch2[2], 18 + 2);
3752        buffer.store(scratch2[3], 18 + 3);
3753        buffer.store(scratch2[4], 18 + 4);
3754        buffer.store(scratch2[5], 18 + 5);
3755        buffer.store(scratch2[6], 18 + 6);
3756        buffer.store(scratch2[7], 18 + 7);
3757        buffer.store(scratch2[8], 18 + 8);
3758    }
3759}
3760
3761pub struct Butterfly29<T> {
3762    twiddle1: Complex<T>,
3763    twiddle2: Complex<T>,
3764    twiddle3: Complex<T>,
3765    twiddle4: Complex<T>,
3766    twiddle5: Complex<T>,
3767    twiddle6: Complex<T>,
3768    twiddle7: Complex<T>,
3769    twiddle8: Complex<T>,
3770    twiddle9: Complex<T>,
3771    twiddle10: Complex<T>,
3772    twiddle11: Complex<T>,
3773    twiddle12: Complex<T>,
3774    twiddle13: Complex<T>,
3775    twiddle14: Complex<T>,
3776    direction: FftDirection,
3777}
3778boilerplate_fft_butterfly!(Butterfly29, 29, |this: &Butterfly29<_>| this.direction);
3779impl<T: FftNum> Butterfly29<T> {
3780    pub fn new(direction: FftDirection) -> Self {
3781        let twiddle1: Complex<T> = twiddles::compute_twiddle(1, 29, direction);
3782        let twiddle2: Complex<T> = twiddles::compute_twiddle(2, 29, direction);
3783        let twiddle3: Complex<T> = twiddles::compute_twiddle(3, 29, direction);
3784        let twiddle4: Complex<T> = twiddles::compute_twiddle(4, 29, direction);
3785        let twiddle5: Complex<T> = twiddles::compute_twiddle(5, 29, direction);
3786        let twiddle6: Complex<T> = twiddles::compute_twiddle(6, 29, direction);
3787        let twiddle7: Complex<T> = twiddles::compute_twiddle(7, 29, direction);
3788        let twiddle8: Complex<T> = twiddles::compute_twiddle(8, 29, direction);
3789        let twiddle9: Complex<T> = twiddles::compute_twiddle(9, 29, direction);
3790        let twiddle10: Complex<T> = twiddles::compute_twiddle(10, 29, direction);
3791        let twiddle11: Complex<T> = twiddles::compute_twiddle(11, 29, direction);
3792        let twiddle12: Complex<T> = twiddles::compute_twiddle(12, 29, direction);
3793        let twiddle13: Complex<T> = twiddles::compute_twiddle(13, 29, direction);
3794        let twiddle14: Complex<T> = twiddles::compute_twiddle(14, 29, direction);
3795        Self {
3796            twiddle1,
3797            twiddle2,
3798            twiddle3,
3799            twiddle4,
3800            twiddle5,
3801            twiddle6,
3802            twiddle7,
3803            twiddle8,
3804            twiddle9,
3805            twiddle10,
3806            twiddle11,
3807            twiddle12,
3808            twiddle13,
3809            twiddle14,
3810            direction,
3811        }
3812    }
3813
3814    #[inline(never)]
3815    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
3816        // This function was derived in the same manner as the butterflies for length 3, 5 and 7.
3817        // However, instead of doing it by hand the actual code is autogenerated
3818        // with the `genbutterflies.py` script in the `tools` directory.
3819        let x128p = buffer.load(1) + buffer.load(28);
3820        let x128n = buffer.load(1) - buffer.load(28);
3821        let x227p = buffer.load(2) + buffer.load(27);
3822        let x227n = buffer.load(2) - buffer.load(27);
3823        let x326p = buffer.load(3) + buffer.load(26);
3824        let x326n = buffer.load(3) - buffer.load(26);
3825        let x425p = buffer.load(4) + buffer.load(25);
3826        let x425n = buffer.load(4) - buffer.load(25);
3827        let x524p = buffer.load(5) + buffer.load(24);
3828        let x524n = buffer.load(5) - buffer.load(24);
3829        let x623p = buffer.load(6) + buffer.load(23);
3830        let x623n = buffer.load(6) - buffer.load(23);
3831        let x722p = buffer.load(7) + buffer.load(22);
3832        let x722n = buffer.load(7) - buffer.load(22);
3833        let x821p = buffer.load(8) + buffer.load(21);
3834        let x821n = buffer.load(8) - buffer.load(21);
3835        let x920p = buffer.load(9) + buffer.load(20);
3836        let x920n = buffer.load(9) - buffer.load(20);
3837        let x1019p = buffer.load(10) + buffer.load(19);
3838        let x1019n = buffer.load(10) - buffer.load(19);
3839        let x1118p = buffer.load(11) + buffer.load(18);
3840        let x1118n = buffer.load(11) - buffer.load(18);
3841        let x1217p = buffer.load(12) + buffer.load(17);
3842        let x1217n = buffer.load(12) - buffer.load(17);
3843        let x1316p = buffer.load(13) + buffer.load(16);
3844        let x1316n = buffer.load(13) - buffer.load(16);
3845        let x1415p = buffer.load(14) + buffer.load(15);
3846        let x1415n = buffer.load(14) - buffer.load(15);
3847        let sum = buffer.load(0)
3848            + x128p
3849            + x227p
3850            + x326p
3851            + x425p
3852            + x524p
3853            + x623p
3854            + x722p
3855            + x821p
3856            + x920p
3857            + x1019p
3858            + x1118p
3859            + x1217p
3860            + x1316p
3861            + x1415p;
3862        let b128re_a = buffer.load(0).re
3863            + self.twiddle1.re * x128p.re
3864            + self.twiddle2.re * x227p.re
3865            + self.twiddle3.re * x326p.re
3866            + self.twiddle4.re * x425p.re
3867            + self.twiddle5.re * x524p.re
3868            + self.twiddle6.re * x623p.re
3869            + self.twiddle7.re * x722p.re
3870            + self.twiddle8.re * x821p.re
3871            + self.twiddle9.re * x920p.re
3872            + self.twiddle10.re * x1019p.re
3873            + self.twiddle11.re * x1118p.re
3874            + self.twiddle12.re * x1217p.re
3875            + self.twiddle13.re * x1316p.re
3876            + self.twiddle14.re * x1415p.re;
3877        let b128re_b = self.twiddle1.im * x128n.im
3878            + self.twiddle2.im * x227n.im
3879            + self.twiddle3.im * x326n.im
3880            + self.twiddle4.im * x425n.im
3881            + self.twiddle5.im * x524n.im
3882            + self.twiddle6.im * x623n.im
3883            + self.twiddle7.im * x722n.im
3884            + self.twiddle8.im * x821n.im
3885            + self.twiddle9.im * x920n.im
3886            + self.twiddle10.im * x1019n.im
3887            + self.twiddle11.im * x1118n.im
3888            + self.twiddle12.im * x1217n.im
3889            + self.twiddle13.im * x1316n.im
3890            + self.twiddle14.im * x1415n.im;
3891        let b227re_a = buffer.load(0).re
3892            + self.twiddle2.re * x128p.re
3893            + self.twiddle4.re * x227p.re
3894            + self.twiddle6.re * x326p.re
3895            + self.twiddle8.re * x425p.re
3896            + self.twiddle10.re * x524p.re
3897            + self.twiddle12.re * x623p.re
3898            + self.twiddle14.re * x722p.re
3899            + self.twiddle13.re * x821p.re
3900            + self.twiddle11.re * x920p.re
3901            + self.twiddle9.re * x1019p.re
3902            + self.twiddle7.re * x1118p.re
3903            + self.twiddle5.re * x1217p.re
3904            + self.twiddle3.re * x1316p.re
3905            + self.twiddle1.re * x1415p.re;
3906        let b227re_b = self.twiddle2.im * x128n.im
3907            + self.twiddle4.im * x227n.im
3908            + self.twiddle6.im * x326n.im
3909            + self.twiddle8.im * x425n.im
3910            + self.twiddle10.im * x524n.im
3911            + self.twiddle12.im * x623n.im
3912            + self.twiddle14.im * x722n.im
3913            + -self.twiddle13.im * x821n.im
3914            + -self.twiddle11.im * x920n.im
3915            + -self.twiddle9.im * x1019n.im
3916            + -self.twiddle7.im * x1118n.im
3917            + -self.twiddle5.im * x1217n.im
3918            + -self.twiddle3.im * x1316n.im
3919            + -self.twiddle1.im * x1415n.im;
3920        let b326re_a = buffer.load(0).re
3921            + self.twiddle3.re * x128p.re
3922            + self.twiddle6.re * x227p.re
3923            + self.twiddle9.re * x326p.re
3924            + self.twiddle12.re * x425p.re
3925            + self.twiddle14.re * x524p.re
3926            + self.twiddle11.re * x623p.re
3927            + self.twiddle8.re * x722p.re
3928            + self.twiddle5.re * x821p.re
3929            + self.twiddle2.re * x920p.re
3930            + self.twiddle1.re * x1019p.re
3931            + self.twiddle4.re * x1118p.re
3932            + self.twiddle7.re * x1217p.re
3933            + self.twiddle10.re * x1316p.re
3934            + self.twiddle13.re * x1415p.re;
3935        let b326re_b = self.twiddle3.im * x128n.im
3936            + self.twiddle6.im * x227n.im
3937            + self.twiddle9.im * x326n.im
3938            + self.twiddle12.im * x425n.im
3939            + -self.twiddle14.im * x524n.im
3940            + -self.twiddle11.im * x623n.im
3941            + -self.twiddle8.im * x722n.im
3942            + -self.twiddle5.im * x821n.im
3943            + -self.twiddle2.im * x920n.im
3944            + self.twiddle1.im * x1019n.im
3945            + self.twiddle4.im * x1118n.im
3946            + self.twiddle7.im * x1217n.im
3947            + self.twiddle10.im * x1316n.im
3948            + self.twiddle13.im * x1415n.im;
3949        let b425re_a = buffer.load(0).re
3950            + self.twiddle4.re * x128p.re
3951            + self.twiddle8.re * x227p.re
3952            + self.twiddle12.re * x326p.re
3953            + self.twiddle13.re * x425p.re
3954            + self.twiddle9.re * x524p.re
3955            + self.twiddle5.re * x623p.re
3956            + self.twiddle1.re * x722p.re
3957            + self.twiddle3.re * x821p.re
3958            + self.twiddle7.re * x920p.re
3959            + self.twiddle11.re * x1019p.re
3960            + self.twiddle14.re * x1118p.re
3961            + self.twiddle10.re * x1217p.re
3962            + self.twiddle6.re * x1316p.re
3963            + self.twiddle2.re * x1415p.re;
3964        let b425re_b = self.twiddle4.im * x128n.im
3965            + self.twiddle8.im * x227n.im
3966            + self.twiddle12.im * x326n.im
3967            + -self.twiddle13.im * x425n.im
3968            + -self.twiddle9.im * x524n.im
3969            + -self.twiddle5.im * x623n.im
3970            + -self.twiddle1.im * x722n.im
3971            + self.twiddle3.im * x821n.im
3972            + self.twiddle7.im * x920n.im
3973            + self.twiddle11.im * x1019n.im
3974            + -self.twiddle14.im * x1118n.im
3975            + -self.twiddle10.im * x1217n.im
3976            + -self.twiddle6.im * x1316n.im
3977            + -self.twiddle2.im * x1415n.im;
3978        let b524re_a = buffer.load(0).re
3979            + self.twiddle5.re * x128p.re
3980            + self.twiddle10.re * x227p.re
3981            + self.twiddle14.re * x326p.re
3982            + self.twiddle9.re * x425p.re
3983            + self.twiddle4.re * x524p.re
3984            + self.twiddle1.re * x623p.re
3985            + self.twiddle6.re * x722p.re
3986            + self.twiddle11.re * x821p.re
3987            + self.twiddle13.re * x920p.re
3988            + self.twiddle8.re * x1019p.re
3989            + self.twiddle3.re * x1118p.re
3990            + self.twiddle2.re * x1217p.re
3991            + self.twiddle7.re * x1316p.re
3992            + self.twiddle12.re * x1415p.re;
3993        let b524re_b = self.twiddle5.im * x128n.im
3994            + self.twiddle10.im * x227n.im
3995            + -self.twiddle14.im * x326n.im
3996            + -self.twiddle9.im * x425n.im
3997            + -self.twiddle4.im * x524n.im
3998            + self.twiddle1.im * x623n.im
3999            + self.twiddle6.im * x722n.im
4000            + self.twiddle11.im * x821n.im
4001            + -self.twiddle13.im * x920n.im
4002            + -self.twiddle8.im * x1019n.im
4003            + -self.twiddle3.im * x1118n.im
4004            + self.twiddle2.im * x1217n.im
4005            + self.twiddle7.im * x1316n.im
4006            + self.twiddle12.im * x1415n.im;
4007        let b623re_a = buffer.load(0).re
4008            + self.twiddle6.re * x128p.re
4009            + self.twiddle12.re * x227p.re
4010            + self.twiddle11.re * x326p.re
4011            + self.twiddle5.re * x425p.re
4012            + self.twiddle1.re * x524p.re
4013            + self.twiddle7.re * x623p.re
4014            + self.twiddle13.re * x722p.re
4015            + self.twiddle10.re * x821p.re
4016            + self.twiddle4.re * x920p.re
4017            + self.twiddle2.re * x1019p.re
4018            + self.twiddle8.re * x1118p.re
4019            + self.twiddle14.re * x1217p.re
4020            + self.twiddle9.re * x1316p.re
4021            + self.twiddle3.re * x1415p.re;
4022        let b623re_b = self.twiddle6.im * x128n.im
4023            + self.twiddle12.im * x227n.im
4024            + -self.twiddle11.im * x326n.im
4025            + -self.twiddle5.im * x425n.im
4026            + self.twiddle1.im * x524n.im
4027            + self.twiddle7.im * x623n.im
4028            + self.twiddle13.im * x722n.im
4029            + -self.twiddle10.im * x821n.im
4030            + -self.twiddle4.im * x920n.im
4031            + self.twiddle2.im * x1019n.im
4032            + self.twiddle8.im * x1118n.im
4033            + self.twiddle14.im * x1217n.im
4034            + -self.twiddle9.im * x1316n.im
4035            + -self.twiddle3.im * x1415n.im;
4036        let b722re_a = buffer.load(0).re
4037            + self.twiddle7.re * x128p.re
4038            + self.twiddle14.re * x227p.re
4039            + self.twiddle8.re * x326p.re
4040            + self.twiddle1.re * x425p.re
4041            + self.twiddle6.re * x524p.re
4042            + self.twiddle13.re * x623p.re
4043            + self.twiddle9.re * x722p.re
4044            + self.twiddle2.re * x821p.re
4045            + self.twiddle5.re * x920p.re
4046            + self.twiddle12.re * x1019p.re
4047            + self.twiddle10.re * x1118p.re
4048            + self.twiddle3.re * x1217p.re
4049            + self.twiddle4.re * x1316p.re
4050            + self.twiddle11.re * x1415p.re;
4051        let b722re_b = self.twiddle7.im * x128n.im
4052            + self.twiddle14.im * x227n.im
4053            + -self.twiddle8.im * x326n.im
4054            + -self.twiddle1.im * x425n.im
4055            + self.twiddle6.im * x524n.im
4056            + self.twiddle13.im * x623n.im
4057            + -self.twiddle9.im * x722n.im
4058            + -self.twiddle2.im * x821n.im
4059            + self.twiddle5.im * x920n.im
4060            + self.twiddle12.im * x1019n.im
4061            + -self.twiddle10.im * x1118n.im
4062            + -self.twiddle3.im * x1217n.im
4063            + self.twiddle4.im * x1316n.im
4064            + self.twiddle11.im * x1415n.im;
4065        let b821re_a = buffer.load(0).re
4066            + self.twiddle8.re * x128p.re
4067            + self.twiddle13.re * x227p.re
4068            + self.twiddle5.re * x326p.re
4069            + self.twiddle3.re * x425p.re
4070            + self.twiddle11.re * x524p.re
4071            + self.twiddle10.re * x623p.re
4072            + self.twiddle2.re * x722p.re
4073            + self.twiddle6.re * x821p.re
4074            + self.twiddle14.re * x920p.re
4075            + self.twiddle7.re * x1019p.re
4076            + self.twiddle1.re * x1118p.re
4077            + self.twiddle9.re * x1217p.re
4078            + self.twiddle12.re * x1316p.re
4079            + self.twiddle4.re * x1415p.re;
4080        let b821re_b = self.twiddle8.im * x128n.im
4081            + -self.twiddle13.im * x227n.im
4082            + -self.twiddle5.im * x326n.im
4083            + self.twiddle3.im * x425n.im
4084            + self.twiddle11.im * x524n.im
4085            + -self.twiddle10.im * x623n.im
4086            + -self.twiddle2.im * x722n.im
4087            + self.twiddle6.im * x821n.im
4088            + self.twiddle14.im * x920n.im
4089            + -self.twiddle7.im * x1019n.im
4090            + self.twiddle1.im * x1118n.im
4091            + self.twiddle9.im * x1217n.im
4092            + -self.twiddle12.im * x1316n.im
4093            + -self.twiddle4.im * x1415n.im;
4094        let b920re_a = buffer.load(0).re
4095            + self.twiddle9.re * x128p.re
4096            + self.twiddle11.re * x227p.re
4097            + self.twiddle2.re * x326p.re
4098            + self.twiddle7.re * x425p.re
4099            + self.twiddle13.re * x524p.re
4100            + self.twiddle4.re * x623p.re
4101            + self.twiddle5.re * x722p.re
4102            + self.twiddle14.re * x821p.re
4103            + self.twiddle6.re * x920p.re
4104            + self.twiddle3.re * x1019p.re
4105            + self.twiddle12.re * x1118p.re
4106            + self.twiddle8.re * x1217p.re
4107            + self.twiddle1.re * x1316p.re
4108            + self.twiddle10.re * x1415p.re;
4109        let b920re_b = self.twiddle9.im * x128n.im
4110            + -self.twiddle11.im * x227n.im
4111            + -self.twiddle2.im * x326n.im
4112            + self.twiddle7.im * x425n.im
4113            + -self.twiddle13.im * x524n.im
4114            + -self.twiddle4.im * x623n.im
4115            + self.twiddle5.im * x722n.im
4116            + self.twiddle14.im * x821n.im
4117            + -self.twiddle6.im * x920n.im
4118            + self.twiddle3.im * x1019n.im
4119            + self.twiddle12.im * x1118n.im
4120            + -self.twiddle8.im * x1217n.im
4121            + self.twiddle1.im * x1316n.im
4122            + self.twiddle10.im * x1415n.im;
4123        let b1019re_a = buffer.load(0).re
4124            + self.twiddle10.re * x128p.re
4125            + self.twiddle9.re * x227p.re
4126            + self.twiddle1.re * x326p.re
4127            + self.twiddle11.re * x425p.re
4128            + self.twiddle8.re * x524p.re
4129            + self.twiddle2.re * x623p.re
4130            + self.twiddle12.re * x722p.re
4131            + self.twiddle7.re * x821p.re
4132            + self.twiddle3.re * x920p.re
4133            + self.twiddle13.re * x1019p.re
4134            + self.twiddle6.re * x1118p.re
4135            + self.twiddle4.re * x1217p.re
4136            + self.twiddle14.re * x1316p.re
4137            + self.twiddle5.re * x1415p.re;
4138        let b1019re_b = self.twiddle10.im * x128n.im
4139            + -self.twiddle9.im * x227n.im
4140            + self.twiddle1.im * x326n.im
4141            + self.twiddle11.im * x425n.im
4142            + -self.twiddle8.im * x524n.im
4143            + self.twiddle2.im * x623n.im
4144            + self.twiddle12.im * x722n.im
4145            + -self.twiddle7.im * x821n.im
4146            + self.twiddle3.im * x920n.im
4147            + self.twiddle13.im * x1019n.im
4148            + -self.twiddle6.im * x1118n.im
4149            + self.twiddle4.im * x1217n.im
4150            + self.twiddle14.im * x1316n.im
4151            + -self.twiddle5.im * x1415n.im;
4152        let b1118re_a = buffer.load(0).re
4153            + self.twiddle11.re * x128p.re
4154            + self.twiddle7.re * x227p.re
4155            + self.twiddle4.re * x326p.re
4156            + self.twiddle14.re * x425p.re
4157            + self.twiddle3.re * x524p.re
4158            + self.twiddle8.re * x623p.re
4159            + self.twiddle10.re * x722p.re
4160            + self.twiddle1.re * x821p.re
4161            + self.twiddle12.re * x920p.re
4162            + self.twiddle6.re * x1019p.re
4163            + self.twiddle5.re * x1118p.re
4164            + self.twiddle13.re * x1217p.re
4165            + self.twiddle2.re * x1316p.re
4166            + self.twiddle9.re * x1415p.re;
4167        let b1118re_b = self.twiddle11.im * x128n.im
4168            + -self.twiddle7.im * x227n.im
4169            + self.twiddle4.im * x326n.im
4170            + -self.twiddle14.im * x425n.im
4171            + -self.twiddle3.im * x524n.im
4172            + self.twiddle8.im * x623n.im
4173            + -self.twiddle10.im * x722n.im
4174            + self.twiddle1.im * x821n.im
4175            + self.twiddle12.im * x920n.im
4176            + -self.twiddle6.im * x1019n.im
4177            + self.twiddle5.im * x1118n.im
4178            + -self.twiddle13.im * x1217n.im
4179            + -self.twiddle2.im * x1316n.im
4180            + self.twiddle9.im * x1415n.im;
4181        let b1217re_a = buffer.load(0).re
4182            + self.twiddle12.re * x128p.re
4183            + self.twiddle5.re * x227p.re
4184            + self.twiddle7.re * x326p.re
4185            + self.twiddle10.re * x425p.re
4186            + self.twiddle2.re * x524p.re
4187            + self.twiddle14.re * x623p.re
4188            + self.twiddle3.re * x722p.re
4189            + self.twiddle9.re * x821p.re
4190            + self.twiddle8.re * x920p.re
4191            + self.twiddle4.re * x1019p.re
4192            + self.twiddle13.re * x1118p.re
4193            + self.twiddle1.re * x1217p.re
4194            + self.twiddle11.re * x1316p.re
4195            + self.twiddle6.re * x1415p.re;
4196        let b1217re_b = self.twiddle12.im * x128n.im
4197            + -self.twiddle5.im * x227n.im
4198            + self.twiddle7.im * x326n.im
4199            + -self.twiddle10.im * x425n.im
4200            + self.twiddle2.im * x524n.im
4201            + self.twiddle14.im * x623n.im
4202            + -self.twiddle3.im * x722n.im
4203            + self.twiddle9.im * x821n.im
4204            + -self.twiddle8.im * x920n.im
4205            + self.twiddle4.im * x1019n.im
4206            + -self.twiddle13.im * x1118n.im
4207            + -self.twiddle1.im * x1217n.im
4208            + self.twiddle11.im * x1316n.im
4209            + -self.twiddle6.im * x1415n.im;
4210        let b1316re_a = buffer.load(0).re
4211            + self.twiddle13.re * x128p.re
4212            + self.twiddle3.re * x227p.re
4213            + self.twiddle10.re * x326p.re
4214            + self.twiddle6.re * x425p.re
4215            + self.twiddle7.re * x524p.re
4216            + self.twiddle9.re * x623p.re
4217            + self.twiddle4.re * x722p.re
4218            + self.twiddle12.re * x821p.re
4219            + self.twiddle1.re * x920p.re
4220            + self.twiddle14.re * x1019p.re
4221            + self.twiddle2.re * x1118p.re
4222            + self.twiddle11.re * x1217p.re
4223            + self.twiddle5.re * x1316p.re
4224            + self.twiddle8.re * x1415p.re;
4225        let b1316re_b = self.twiddle13.im * x128n.im
4226            + -self.twiddle3.im * x227n.im
4227            + self.twiddle10.im * x326n.im
4228            + -self.twiddle6.im * x425n.im
4229            + self.twiddle7.im * x524n.im
4230            + -self.twiddle9.im * x623n.im
4231            + self.twiddle4.im * x722n.im
4232            + -self.twiddle12.im * x821n.im
4233            + self.twiddle1.im * x920n.im
4234            + self.twiddle14.im * x1019n.im
4235            + -self.twiddle2.im * x1118n.im
4236            + self.twiddle11.im * x1217n.im
4237            + -self.twiddle5.im * x1316n.im
4238            + self.twiddle8.im * x1415n.im;
4239        let b1415re_a = buffer.load(0).re
4240            + self.twiddle14.re * x128p.re
4241            + self.twiddle1.re * x227p.re
4242            + self.twiddle13.re * x326p.re
4243            + self.twiddle2.re * x425p.re
4244            + self.twiddle12.re * x524p.re
4245            + self.twiddle3.re * x623p.re
4246            + self.twiddle11.re * x722p.re
4247            + self.twiddle4.re * x821p.re
4248            + self.twiddle10.re * x920p.re
4249            + self.twiddle5.re * x1019p.re
4250            + self.twiddle9.re * x1118p.re
4251            + self.twiddle6.re * x1217p.re
4252            + self.twiddle8.re * x1316p.re
4253            + self.twiddle7.re * x1415p.re;
4254        let b1415re_b = self.twiddle14.im * x128n.im
4255            + -self.twiddle1.im * x227n.im
4256            + self.twiddle13.im * x326n.im
4257            + -self.twiddle2.im * x425n.im
4258            + self.twiddle12.im * x524n.im
4259            + -self.twiddle3.im * x623n.im
4260            + self.twiddle11.im * x722n.im
4261            + -self.twiddle4.im * x821n.im
4262            + self.twiddle10.im * x920n.im
4263            + -self.twiddle5.im * x1019n.im
4264            + self.twiddle9.im * x1118n.im
4265            + -self.twiddle6.im * x1217n.im
4266            + self.twiddle8.im * x1316n.im
4267            + -self.twiddle7.im * x1415n.im;
4268
4269        let b128im_a = buffer.load(0).im
4270            + self.twiddle1.re * x128p.im
4271            + self.twiddle2.re * x227p.im
4272            + self.twiddle3.re * x326p.im
4273            + self.twiddle4.re * x425p.im
4274            + self.twiddle5.re * x524p.im
4275            + self.twiddle6.re * x623p.im
4276            + self.twiddle7.re * x722p.im
4277            + self.twiddle8.re * x821p.im
4278            + self.twiddle9.re * x920p.im
4279            + self.twiddle10.re * x1019p.im
4280            + self.twiddle11.re * x1118p.im
4281            + self.twiddle12.re * x1217p.im
4282            + self.twiddle13.re * x1316p.im
4283            + self.twiddle14.re * x1415p.im;
4284        let b128im_b = self.twiddle1.im * x128n.re
4285            + self.twiddle2.im * x227n.re
4286            + self.twiddle3.im * x326n.re
4287            + self.twiddle4.im * x425n.re
4288            + self.twiddle5.im * x524n.re
4289            + self.twiddle6.im * x623n.re
4290            + self.twiddle7.im * x722n.re
4291            + self.twiddle8.im * x821n.re
4292            + self.twiddle9.im * x920n.re
4293            + self.twiddle10.im * x1019n.re
4294            + self.twiddle11.im * x1118n.re
4295            + self.twiddle12.im * x1217n.re
4296            + self.twiddle13.im * x1316n.re
4297            + self.twiddle14.im * x1415n.re;
4298        let b227im_a = buffer.load(0).im
4299            + self.twiddle2.re * x128p.im
4300            + self.twiddle4.re * x227p.im
4301            + self.twiddle6.re * x326p.im
4302            + self.twiddle8.re * x425p.im
4303            + self.twiddle10.re * x524p.im
4304            + self.twiddle12.re * x623p.im
4305            + self.twiddle14.re * x722p.im
4306            + self.twiddle13.re * x821p.im
4307            + self.twiddle11.re * x920p.im
4308            + self.twiddle9.re * x1019p.im
4309            + self.twiddle7.re * x1118p.im
4310            + self.twiddle5.re * x1217p.im
4311            + self.twiddle3.re * x1316p.im
4312            + self.twiddle1.re * x1415p.im;
4313        let b227im_b = self.twiddle2.im * x128n.re
4314            + self.twiddle4.im * x227n.re
4315            + self.twiddle6.im * x326n.re
4316            + self.twiddle8.im * x425n.re
4317            + self.twiddle10.im * x524n.re
4318            + self.twiddle12.im * x623n.re
4319            + self.twiddle14.im * x722n.re
4320            + -self.twiddle13.im * x821n.re
4321            + -self.twiddle11.im * x920n.re
4322            + -self.twiddle9.im * x1019n.re
4323            + -self.twiddle7.im * x1118n.re
4324            + -self.twiddle5.im * x1217n.re
4325            + -self.twiddle3.im * x1316n.re
4326            + -self.twiddle1.im * x1415n.re;
4327        let b326im_a = buffer.load(0).im
4328            + self.twiddle3.re * x128p.im
4329            + self.twiddle6.re * x227p.im
4330            + self.twiddle9.re * x326p.im
4331            + self.twiddle12.re * x425p.im
4332            + self.twiddle14.re * x524p.im
4333            + self.twiddle11.re * x623p.im
4334            + self.twiddle8.re * x722p.im
4335            + self.twiddle5.re * x821p.im
4336            + self.twiddle2.re * x920p.im
4337            + self.twiddle1.re * x1019p.im
4338            + self.twiddle4.re * x1118p.im
4339            + self.twiddle7.re * x1217p.im
4340            + self.twiddle10.re * x1316p.im
4341            + self.twiddle13.re * x1415p.im;
4342        let b326im_b = self.twiddle3.im * x128n.re
4343            + self.twiddle6.im * x227n.re
4344            + self.twiddle9.im * x326n.re
4345            + self.twiddle12.im * x425n.re
4346            + -self.twiddle14.im * x524n.re
4347            + -self.twiddle11.im * x623n.re
4348            + -self.twiddle8.im * x722n.re
4349            + -self.twiddle5.im * x821n.re
4350            + -self.twiddle2.im * x920n.re
4351            + self.twiddle1.im * x1019n.re
4352            + self.twiddle4.im * x1118n.re
4353            + self.twiddle7.im * x1217n.re
4354            + self.twiddle10.im * x1316n.re
4355            + self.twiddle13.im * x1415n.re;
4356        let b425im_a = buffer.load(0).im
4357            + self.twiddle4.re * x128p.im
4358            + self.twiddle8.re * x227p.im
4359            + self.twiddle12.re * x326p.im
4360            + self.twiddle13.re * x425p.im
4361            + self.twiddle9.re * x524p.im
4362            + self.twiddle5.re * x623p.im
4363            + self.twiddle1.re * x722p.im
4364            + self.twiddle3.re * x821p.im
4365            + self.twiddle7.re * x920p.im
4366            + self.twiddle11.re * x1019p.im
4367            + self.twiddle14.re * x1118p.im
4368            + self.twiddle10.re * x1217p.im
4369            + self.twiddle6.re * x1316p.im
4370            + self.twiddle2.re * x1415p.im;
4371        let b425im_b = self.twiddle4.im * x128n.re
4372            + self.twiddle8.im * x227n.re
4373            + self.twiddle12.im * x326n.re
4374            + -self.twiddle13.im * x425n.re
4375            + -self.twiddle9.im * x524n.re
4376            + -self.twiddle5.im * x623n.re
4377            + -self.twiddle1.im * x722n.re
4378            + self.twiddle3.im * x821n.re
4379            + self.twiddle7.im * x920n.re
4380            + self.twiddle11.im * x1019n.re
4381            + -self.twiddle14.im * x1118n.re
4382            + -self.twiddle10.im * x1217n.re
4383            + -self.twiddle6.im * x1316n.re
4384            + -self.twiddle2.im * x1415n.re;
4385        let b524im_a = buffer.load(0).im
4386            + self.twiddle5.re * x128p.im
4387            + self.twiddle10.re * x227p.im
4388            + self.twiddle14.re * x326p.im
4389            + self.twiddle9.re * x425p.im
4390            + self.twiddle4.re * x524p.im
4391            + self.twiddle1.re * x623p.im
4392            + self.twiddle6.re * x722p.im
4393            + self.twiddle11.re * x821p.im
4394            + self.twiddle13.re * x920p.im
4395            + self.twiddle8.re * x1019p.im
4396            + self.twiddle3.re * x1118p.im
4397            + self.twiddle2.re * x1217p.im
4398            + self.twiddle7.re * x1316p.im
4399            + self.twiddle12.re * x1415p.im;
4400        let b524im_b = self.twiddle5.im * x128n.re
4401            + self.twiddle10.im * x227n.re
4402            + -self.twiddle14.im * x326n.re
4403            + -self.twiddle9.im * x425n.re
4404            + -self.twiddle4.im * x524n.re
4405            + self.twiddle1.im * x623n.re
4406            + self.twiddle6.im * x722n.re
4407            + self.twiddle11.im * x821n.re
4408            + -self.twiddle13.im * x920n.re
4409            + -self.twiddle8.im * x1019n.re
4410            + -self.twiddle3.im * x1118n.re
4411            + self.twiddle2.im * x1217n.re
4412            + self.twiddle7.im * x1316n.re
4413            + self.twiddle12.im * x1415n.re;
4414        let b623im_a = buffer.load(0).im
4415            + self.twiddle6.re * x128p.im
4416            + self.twiddle12.re * x227p.im
4417            + self.twiddle11.re * x326p.im
4418            + self.twiddle5.re * x425p.im
4419            + self.twiddle1.re * x524p.im
4420            + self.twiddle7.re * x623p.im
4421            + self.twiddle13.re * x722p.im
4422            + self.twiddle10.re * x821p.im
4423            + self.twiddle4.re * x920p.im
4424            + self.twiddle2.re * x1019p.im
4425            + self.twiddle8.re * x1118p.im
4426            + self.twiddle14.re * x1217p.im
4427            + self.twiddle9.re * x1316p.im
4428            + self.twiddle3.re * x1415p.im;
4429        let b623im_b = self.twiddle6.im * x128n.re
4430            + self.twiddle12.im * x227n.re
4431            + -self.twiddle11.im * x326n.re
4432            + -self.twiddle5.im * x425n.re
4433            + self.twiddle1.im * x524n.re
4434            + self.twiddle7.im * x623n.re
4435            + self.twiddle13.im * x722n.re
4436            + -self.twiddle10.im * x821n.re
4437            + -self.twiddle4.im * x920n.re
4438            + self.twiddle2.im * x1019n.re
4439            + self.twiddle8.im * x1118n.re
4440            + self.twiddle14.im * x1217n.re
4441            + -self.twiddle9.im * x1316n.re
4442            + -self.twiddle3.im * x1415n.re;
4443        let b722im_a = buffer.load(0).im
4444            + self.twiddle7.re * x128p.im
4445            + self.twiddle14.re * x227p.im
4446            + self.twiddle8.re * x326p.im
4447            + self.twiddle1.re * x425p.im
4448            + self.twiddle6.re * x524p.im
4449            + self.twiddle13.re * x623p.im
4450            + self.twiddle9.re * x722p.im
4451            + self.twiddle2.re * x821p.im
4452            + self.twiddle5.re * x920p.im
4453            + self.twiddle12.re * x1019p.im
4454            + self.twiddle10.re * x1118p.im
4455            + self.twiddle3.re * x1217p.im
4456            + self.twiddle4.re * x1316p.im
4457            + self.twiddle11.re * x1415p.im;
4458        let b722im_b = self.twiddle7.im * x128n.re
4459            + self.twiddle14.im * x227n.re
4460            + -self.twiddle8.im * x326n.re
4461            + -self.twiddle1.im * x425n.re
4462            + self.twiddle6.im * x524n.re
4463            + self.twiddle13.im * x623n.re
4464            + -self.twiddle9.im * x722n.re
4465            + -self.twiddle2.im * x821n.re
4466            + self.twiddle5.im * x920n.re
4467            + self.twiddle12.im * x1019n.re
4468            + -self.twiddle10.im * x1118n.re
4469            + -self.twiddle3.im * x1217n.re
4470            + self.twiddle4.im * x1316n.re
4471            + self.twiddle11.im * x1415n.re;
4472        let b821im_a = buffer.load(0).im
4473            + self.twiddle8.re * x128p.im
4474            + self.twiddle13.re * x227p.im
4475            + self.twiddle5.re * x326p.im
4476            + self.twiddle3.re * x425p.im
4477            + self.twiddle11.re * x524p.im
4478            + self.twiddle10.re * x623p.im
4479            + self.twiddle2.re * x722p.im
4480            + self.twiddle6.re * x821p.im
4481            + self.twiddle14.re * x920p.im
4482            + self.twiddle7.re * x1019p.im
4483            + self.twiddle1.re * x1118p.im
4484            + self.twiddle9.re * x1217p.im
4485            + self.twiddle12.re * x1316p.im
4486            + self.twiddle4.re * x1415p.im;
4487        let b821im_b = self.twiddle8.im * x128n.re
4488            + -self.twiddle13.im * x227n.re
4489            + -self.twiddle5.im * x326n.re
4490            + self.twiddle3.im * x425n.re
4491            + self.twiddle11.im * x524n.re
4492            + -self.twiddle10.im * x623n.re
4493            + -self.twiddle2.im * x722n.re
4494            + self.twiddle6.im * x821n.re
4495            + self.twiddle14.im * x920n.re
4496            + -self.twiddle7.im * x1019n.re
4497            + self.twiddle1.im * x1118n.re
4498            + self.twiddle9.im * x1217n.re
4499            + -self.twiddle12.im * x1316n.re
4500            + -self.twiddle4.im * x1415n.re;
4501        let b920im_a = buffer.load(0).im
4502            + self.twiddle9.re * x128p.im
4503            + self.twiddle11.re * x227p.im
4504            + self.twiddle2.re * x326p.im
4505            + self.twiddle7.re * x425p.im
4506            + self.twiddle13.re * x524p.im
4507            + self.twiddle4.re * x623p.im
4508            + self.twiddle5.re * x722p.im
4509            + self.twiddle14.re * x821p.im
4510            + self.twiddle6.re * x920p.im
4511            + self.twiddle3.re * x1019p.im
4512            + self.twiddle12.re * x1118p.im
4513            + self.twiddle8.re * x1217p.im
4514            + self.twiddle1.re * x1316p.im
4515            + self.twiddle10.re * x1415p.im;
4516        let b920im_b = self.twiddle9.im * x128n.re
4517            + -self.twiddle11.im * x227n.re
4518            + -self.twiddle2.im * x326n.re
4519            + self.twiddle7.im * x425n.re
4520            + -self.twiddle13.im * x524n.re
4521            + -self.twiddle4.im * x623n.re
4522            + self.twiddle5.im * x722n.re
4523            + self.twiddle14.im * x821n.re
4524            + -self.twiddle6.im * x920n.re
4525            + self.twiddle3.im * x1019n.re
4526            + self.twiddle12.im * x1118n.re
4527            + -self.twiddle8.im * x1217n.re
4528            + self.twiddle1.im * x1316n.re
4529            + self.twiddle10.im * x1415n.re;
4530        let b1019im_a = buffer.load(0).im
4531            + self.twiddle10.re * x128p.im
4532            + self.twiddle9.re * x227p.im
4533            + self.twiddle1.re * x326p.im
4534            + self.twiddle11.re * x425p.im
4535            + self.twiddle8.re * x524p.im
4536            + self.twiddle2.re * x623p.im
4537            + self.twiddle12.re * x722p.im
4538            + self.twiddle7.re * x821p.im
4539            + self.twiddle3.re * x920p.im
4540            + self.twiddle13.re * x1019p.im
4541            + self.twiddle6.re * x1118p.im
4542            + self.twiddle4.re * x1217p.im
4543            + self.twiddle14.re * x1316p.im
4544            + self.twiddle5.re * x1415p.im;
4545        let b1019im_b = self.twiddle10.im * x128n.re
4546            + -self.twiddle9.im * x227n.re
4547            + self.twiddle1.im * x326n.re
4548            + self.twiddle11.im * x425n.re
4549            + -self.twiddle8.im * x524n.re
4550            + self.twiddle2.im * x623n.re
4551            + self.twiddle12.im * x722n.re
4552            + -self.twiddle7.im * x821n.re
4553            + self.twiddle3.im * x920n.re
4554            + self.twiddle13.im * x1019n.re
4555            + -self.twiddle6.im * x1118n.re
4556            + self.twiddle4.im * x1217n.re
4557            + self.twiddle14.im * x1316n.re
4558            + -self.twiddle5.im * x1415n.re;
4559        let b1118im_a = buffer.load(0).im
4560            + self.twiddle11.re * x128p.im
4561            + self.twiddle7.re * x227p.im
4562            + self.twiddle4.re * x326p.im
4563            + self.twiddle14.re * x425p.im
4564            + self.twiddle3.re * x524p.im
4565            + self.twiddle8.re * x623p.im
4566            + self.twiddle10.re * x722p.im
4567            + self.twiddle1.re * x821p.im
4568            + self.twiddle12.re * x920p.im
4569            + self.twiddle6.re * x1019p.im
4570            + self.twiddle5.re * x1118p.im
4571            + self.twiddle13.re * x1217p.im
4572            + self.twiddle2.re * x1316p.im
4573            + self.twiddle9.re * x1415p.im;
4574        let b1118im_b = self.twiddle11.im * x128n.re
4575            + -self.twiddle7.im * x227n.re
4576            + self.twiddle4.im * x326n.re
4577            + -self.twiddle14.im * x425n.re
4578            + -self.twiddle3.im * x524n.re
4579            + self.twiddle8.im * x623n.re
4580            + -self.twiddle10.im * x722n.re
4581            + self.twiddle1.im * x821n.re
4582            + self.twiddle12.im * x920n.re
4583            + -self.twiddle6.im * x1019n.re
4584            + self.twiddle5.im * x1118n.re
4585            + -self.twiddle13.im * x1217n.re
4586            + -self.twiddle2.im * x1316n.re
4587            + self.twiddle9.im * x1415n.re;
4588        let b1217im_a = buffer.load(0).im
4589            + self.twiddle12.re * x128p.im
4590            + self.twiddle5.re * x227p.im
4591            + self.twiddle7.re * x326p.im
4592            + self.twiddle10.re * x425p.im
4593            + self.twiddle2.re * x524p.im
4594            + self.twiddle14.re * x623p.im
4595            + self.twiddle3.re * x722p.im
4596            + self.twiddle9.re * x821p.im
4597            + self.twiddle8.re * x920p.im
4598            + self.twiddle4.re * x1019p.im
4599            + self.twiddle13.re * x1118p.im
4600            + self.twiddle1.re * x1217p.im
4601            + self.twiddle11.re * x1316p.im
4602            + self.twiddle6.re * x1415p.im;
4603        let b1217im_b = self.twiddle12.im * x128n.re
4604            + -self.twiddle5.im * x227n.re
4605            + self.twiddle7.im * x326n.re
4606            + -self.twiddle10.im * x425n.re
4607            + self.twiddle2.im * x524n.re
4608            + self.twiddle14.im * x623n.re
4609            + -self.twiddle3.im * x722n.re
4610            + self.twiddle9.im * x821n.re
4611            + -self.twiddle8.im * x920n.re
4612            + self.twiddle4.im * x1019n.re
4613            + -self.twiddle13.im * x1118n.re
4614            + -self.twiddle1.im * x1217n.re
4615            + self.twiddle11.im * x1316n.re
4616            + -self.twiddle6.im * x1415n.re;
4617        let b1316im_a = buffer.load(0).im
4618            + self.twiddle13.re * x128p.im
4619            + self.twiddle3.re * x227p.im
4620            + self.twiddle10.re * x326p.im
4621            + self.twiddle6.re * x425p.im
4622            + self.twiddle7.re * x524p.im
4623            + self.twiddle9.re * x623p.im
4624            + self.twiddle4.re * x722p.im
4625            + self.twiddle12.re * x821p.im
4626            + self.twiddle1.re * x920p.im
4627            + self.twiddle14.re * x1019p.im
4628            + self.twiddle2.re * x1118p.im
4629            + self.twiddle11.re * x1217p.im
4630            + self.twiddle5.re * x1316p.im
4631            + self.twiddle8.re * x1415p.im;
4632        let b1316im_b = self.twiddle13.im * x128n.re
4633            + -self.twiddle3.im * x227n.re
4634            + self.twiddle10.im * x326n.re
4635            + -self.twiddle6.im * x425n.re
4636            + self.twiddle7.im * x524n.re
4637            + -self.twiddle9.im * x623n.re
4638            + self.twiddle4.im * x722n.re
4639            + -self.twiddle12.im * x821n.re
4640            + self.twiddle1.im * x920n.re
4641            + self.twiddle14.im * x1019n.re
4642            + -self.twiddle2.im * x1118n.re
4643            + self.twiddle11.im * x1217n.re
4644            + -self.twiddle5.im * x1316n.re
4645            + self.twiddle8.im * x1415n.re;
4646        let b1415im_a = buffer.load(0).im
4647            + self.twiddle14.re * x128p.im
4648            + self.twiddle1.re * x227p.im
4649            + self.twiddle13.re * x326p.im
4650            + self.twiddle2.re * x425p.im
4651            + self.twiddle12.re * x524p.im
4652            + self.twiddle3.re * x623p.im
4653            + self.twiddle11.re * x722p.im
4654            + self.twiddle4.re * x821p.im
4655            + self.twiddle10.re * x920p.im
4656            + self.twiddle5.re * x1019p.im
4657            + self.twiddle9.re * x1118p.im
4658            + self.twiddle6.re * x1217p.im
4659            + self.twiddle8.re * x1316p.im
4660            + self.twiddle7.re * x1415p.im;
4661        let b1415im_b = self.twiddle14.im * x128n.re
4662            + -self.twiddle1.im * x227n.re
4663            + self.twiddle13.im * x326n.re
4664            + -self.twiddle2.im * x425n.re
4665            + self.twiddle12.im * x524n.re
4666            + -self.twiddle3.im * x623n.re
4667            + self.twiddle11.im * x722n.re
4668            + -self.twiddle4.im * x821n.re
4669            + self.twiddle10.im * x920n.re
4670            + -self.twiddle5.im * x1019n.re
4671            + self.twiddle9.im * x1118n.re
4672            + -self.twiddle6.im * x1217n.re
4673            + self.twiddle8.im * x1316n.re
4674            + -self.twiddle7.im * x1415n.re;
4675
4676        let out1re = b128re_a - b128re_b;
4677        let out1im = b128im_a + b128im_b;
4678        let out2re = b227re_a - b227re_b;
4679        let out2im = b227im_a + b227im_b;
4680        let out3re = b326re_a - b326re_b;
4681        let out3im = b326im_a + b326im_b;
4682        let out4re = b425re_a - b425re_b;
4683        let out4im = b425im_a + b425im_b;
4684        let out5re = b524re_a - b524re_b;
4685        let out5im = b524im_a + b524im_b;
4686        let out6re = b623re_a - b623re_b;
4687        let out6im = b623im_a + b623im_b;
4688        let out7re = b722re_a - b722re_b;
4689        let out7im = b722im_a + b722im_b;
4690        let out8re = b821re_a - b821re_b;
4691        let out8im = b821im_a + b821im_b;
4692        let out9re = b920re_a - b920re_b;
4693        let out9im = b920im_a + b920im_b;
4694        let out10re = b1019re_a - b1019re_b;
4695        let out10im = b1019im_a + b1019im_b;
4696        let out11re = b1118re_a - b1118re_b;
4697        let out11im = b1118im_a + b1118im_b;
4698        let out12re = b1217re_a - b1217re_b;
4699        let out12im = b1217im_a + b1217im_b;
4700        let out13re = b1316re_a - b1316re_b;
4701        let out13im = b1316im_a + b1316im_b;
4702        let out14re = b1415re_a - b1415re_b;
4703        let out14im = b1415im_a + b1415im_b;
4704        let out15re = b1415re_a + b1415re_b;
4705        let out15im = b1415im_a - b1415im_b;
4706        let out16re = b1316re_a + b1316re_b;
4707        let out16im = b1316im_a - b1316im_b;
4708        let out17re = b1217re_a + b1217re_b;
4709        let out17im = b1217im_a - b1217im_b;
4710        let out18re = b1118re_a + b1118re_b;
4711        let out18im = b1118im_a - b1118im_b;
4712        let out19re = b1019re_a + b1019re_b;
4713        let out19im = b1019im_a - b1019im_b;
4714        let out20re = b920re_a + b920re_b;
4715        let out20im = b920im_a - b920im_b;
4716        let out21re = b821re_a + b821re_b;
4717        let out21im = b821im_a - b821im_b;
4718        let out22re = b722re_a + b722re_b;
4719        let out22im = b722im_a - b722im_b;
4720        let out23re = b623re_a + b623re_b;
4721        let out23im = b623im_a - b623im_b;
4722        let out24re = b524re_a + b524re_b;
4723        let out24im = b524im_a - b524im_b;
4724        let out25re = b425re_a + b425re_b;
4725        let out25im = b425im_a - b425im_b;
4726        let out26re = b326re_a + b326re_b;
4727        let out26im = b326im_a - b326im_b;
4728        let out27re = b227re_a + b227re_b;
4729        let out27im = b227im_a - b227im_b;
4730        let out28re = b128re_a + b128re_b;
4731        let out28im = b128im_a - b128im_b;
4732        buffer.store(sum, 0);
4733        buffer.store(
4734            Complex {
4735                re: out1re,
4736                im: out1im,
4737            },
4738            1,
4739        );
4740        buffer.store(
4741            Complex {
4742                re: out2re,
4743                im: out2im,
4744            },
4745            2,
4746        );
4747        buffer.store(
4748            Complex {
4749                re: out3re,
4750                im: out3im,
4751            },
4752            3,
4753        );
4754        buffer.store(
4755            Complex {
4756                re: out4re,
4757                im: out4im,
4758            },
4759            4,
4760        );
4761        buffer.store(
4762            Complex {
4763                re: out5re,
4764                im: out5im,
4765            },
4766            5,
4767        );
4768        buffer.store(
4769            Complex {
4770                re: out6re,
4771                im: out6im,
4772            },
4773            6,
4774        );
4775        buffer.store(
4776            Complex {
4777                re: out7re,
4778                im: out7im,
4779            },
4780            7,
4781        );
4782        buffer.store(
4783            Complex {
4784                re: out8re,
4785                im: out8im,
4786            },
4787            8,
4788        );
4789        buffer.store(
4790            Complex {
4791                re: out9re,
4792                im: out9im,
4793            },
4794            9,
4795        );
4796        buffer.store(
4797            Complex {
4798                re: out10re,
4799                im: out10im,
4800            },
4801            10,
4802        );
4803        buffer.store(
4804            Complex {
4805                re: out11re,
4806                im: out11im,
4807            },
4808            11,
4809        );
4810        buffer.store(
4811            Complex {
4812                re: out12re,
4813                im: out12im,
4814            },
4815            12,
4816        );
4817        buffer.store(
4818            Complex {
4819                re: out13re,
4820                im: out13im,
4821            },
4822            13,
4823        );
4824        buffer.store(
4825            Complex {
4826                re: out14re,
4827                im: out14im,
4828            },
4829            14,
4830        );
4831        buffer.store(
4832            Complex {
4833                re: out15re,
4834                im: out15im,
4835            },
4836            15,
4837        );
4838        buffer.store(
4839            Complex {
4840                re: out16re,
4841                im: out16im,
4842            },
4843            16,
4844        );
4845        buffer.store(
4846            Complex {
4847                re: out17re,
4848                im: out17im,
4849            },
4850            17,
4851        );
4852        buffer.store(
4853            Complex {
4854                re: out18re,
4855                im: out18im,
4856            },
4857            18,
4858        );
4859        buffer.store(
4860            Complex {
4861                re: out19re,
4862                im: out19im,
4863            },
4864            19,
4865        );
4866        buffer.store(
4867            Complex {
4868                re: out20re,
4869                im: out20im,
4870            },
4871            20,
4872        );
4873        buffer.store(
4874            Complex {
4875                re: out21re,
4876                im: out21im,
4877            },
4878            21,
4879        );
4880        buffer.store(
4881            Complex {
4882                re: out22re,
4883                im: out22im,
4884            },
4885            22,
4886        );
4887        buffer.store(
4888            Complex {
4889                re: out23re,
4890                im: out23im,
4891            },
4892            23,
4893        );
4894        buffer.store(
4895            Complex {
4896                re: out24re,
4897                im: out24im,
4898            },
4899            24,
4900        );
4901        buffer.store(
4902            Complex {
4903                re: out25re,
4904                im: out25im,
4905            },
4906            25,
4907        );
4908        buffer.store(
4909            Complex {
4910                re: out26re,
4911                im: out26im,
4912            },
4913            26,
4914        );
4915        buffer.store(
4916            Complex {
4917                re: out27re,
4918                im: out27im,
4919            },
4920            27,
4921        );
4922        buffer.store(
4923            Complex {
4924                re: out28re,
4925                im: out28im,
4926            },
4927            28,
4928        );
4929    }
4930}
4931pub struct Butterfly31<T> {
4932    twiddle1: Complex<T>,
4933    twiddle2: Complex<T>,
4934    twiddle3: Complex<T>,
4935    twiddle4: Complex<T>,
4936    twiddle5: Complex<T>,
4937    twiddle6: Complex<T>,
4938    twiddle7: Complex<T>,
4939    twiddle8: Complex<T>,
4940    twiddle9: Complex<T>,
4941    twiddle10: Complex<T>,
4942    twiddle11: Complex<T>,
4943    twiddle12: Complex<T>,
4944    twiddle13: Complex<T>,
4945    twiddle14: Complex<T>,
4946    twiddle15: Complex<T>,
4947    direction: FftDirection,
4948}
4949boilerplate_fft_butterfly!(Butterfly31, 31, |this: &Butterfly31<_>| this.direction);
4950impl<T: FftNum> Butterfly31<T> {
4951    pub fn new(direction: FftDirection) -> Self {
4952        let twiddle1: Complex<T> = twiddles::compute_twiddle(1, 31, direction);
4953        let twiddle2: Complex<T> = twiddles::compute_twiddle(2, 31, direction);
4954        let twiddle3: Complex<T> = twiddles::compute_twiddle(3, 31, direction);
4955        let twiddle4: Complex<T> = twiddles::compute_twiddle(4, 31, direction);
4956        let twiddle5: Complex<T> = twiddles::compute_twiddle(5, 31, direction);
4957        let twiddle6: Complex<T> = twiddles::compute_twiddle(6, 31, direction);
4958        let twiddle7: Complex<T> = twiddles::compute_twiddle(7, 31, direction);
4959        let twiddle8: Complex<T> = twiddles::compute_twiddle(8, 31, direction);
4960        let twiddle9: Complex<T> = twiddles::compute_twiddle(9, 31, direction);
4961        let twiddle10: Complex<T> = twiddles::compute_twiddle(10, 31, direction);
4962        let twiddle11: Complex<T> = twiddles::compute_twiddle(11, 31, direction);
4963        let twiddle12: Complex<T> = twiddles::compute_twiddle(12, 31, direction);
4964        let twiddle13: Complex<T> = twiddles::compute_twiddle(13, 31, direction);
4965        let twiddle14: Complex<T> = twiddles::compute_twiddle(14, 31, direction);
4966        let twiddle15: Complex<T> = twiddles::compute_twiddle(15, 31, direction);
4967        Self {
4968            twiddle1,
4969            twiddle2,
4970            twiddle3,
4971            twiddle4,
4972            twiddle5,
4973            twiddle6,
4974            twiddle7,
4975            twiddle8,
4976            twiddle9,
4977            twiddle10,
4978            twiddle11,
4979            twiddle12,
4980            twiddle13,
4981            twiddle14,
4982            twiddle15,
4983            direction,
4984        }
4985    }
4986
4987    #[inline(never)]
4988    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
4989        // This function was derived in the same manner as the butterflies for length 3, 5 and 7.
4990        // However, instead of doing it by hand the actual code is autogenerated
4991        // with the `genbutterflies.py` script in the `tools` directory.
4992        let x130p = buffer.load(1) + buffer.load(30);
4993        let x130n = buffer.load(1) - buffer.load(30);
4994        let x229p = buffer.load(2) + buffer.load(29);
4995        let x229n = buffer.load(2) - buffer.load(29);
4996        let x328p = buffer.load(3) + buffer.load(28);
4997        let x328n = buffer.load(3) - buffer.load(28);
4998        let x427p = buffer.load(4) + buffer.load(27);
4999        let x427n = buffer.load(4) - buffer.load(27);
5000        let x526p = buffer.load(5) + buffer.load(26);
5001        let x526n = buffer.load(5) - buffer.load(26);
5002        let x625p = buffer.load(6) + buffer.load(25);
5003        let x625n = buffer.load(6) - buffer.load(25);
5004        let x724p = buffer.load(7) + buffer.load(24);
5005        let x724n = buffer.load(7) - buffer.load(24);
5006        let x823p = buffer.load(8) + buffer.load(23);
5007        let x823n = buffer.load(8) - buffer.load(23);
5008        let x922p = buffer.load(9) + buffer.load(22);
5009        let x922n = buffer.load(9) - buffer.load(22);
5010        let x1021p = buffer.load(10) + buffer.load(21);
5011        let x1021n = buffer.load(10) - buffer.load(21);
5012        let x1120p = buffer.load(11) + buffer.load(20);
5013        let x1120n = buffer.load(11) - buffer.load(20);
5014        let x1219p = buffer.load(12) + buffer.load(19);
5015        let x1219n = buffer.load(12) - buffer.load(19);
5016        let x1318p = buffer.load(13) + buffer.load(18);
5017        let x1318n = buffer.load(13) - buffer.load(18);
5018        let x1417p = buffer.load(14) + buffer.load(17);
5019        let x1417n = buffer.load(14) - buffer.load(17);
5020        let x1516p = buffer.load(15) + buffer.load(16);
5021        let x1516n = buffer.load(15) - buffer.load(16);
5022        let sum = buffer.load(0)
5023            + x130p
5024            + x229p
5025            + x328p
5026            + x427p
5027            + x526p
5028            + x625p
5029            + x724p
5030            + x823p
5031            + x922p
5032            + x1021p
5033            + x1120p
5034            + x1219p
5035            + x1318p
5036            + x1417p
5037            + x1516p;
5038        let b130re_a = buffer.load(0).re
5039            + self.twiddle1.re * x130p.re
5040            + self.twiddle2.re * x229p.re
5041            + self.twiddle3.re * x328p.re
5042            + self.twiddle4.re * x427p.re
5043            + self.twiddle5.re * x526p.re
5044            + self.twiddle6.re * x625p.re
5045            + self.twiddle7.re * x724p.re
5046            + self.twiddle8.re * x823p.re
5047            + self.twiddle9.re * x922p.re
5048            + self.twiddle10.re * x1021p.re
5049            + self.twiddle11.re * x1120p.re
5050            + self.twiddle12.re * x1219p.re
5051            + self.twiddle13.re * x1318p.re
5052            + self.twiddle14.re * x1417p.re
5053            + self.twiddle15.re * x1516p.re;
5054        let b130re_b = self.twiddle1.im * x130n.im
5055            + self.twiddle2.im * x229n.im
5056            + self.twiddle3.im * x328n.im
5057            + self.twiddle4.im * x427n.im
5058            + self.twiddle5.im * x526n.im
5059            + self.twiddle6.im * x625n.im
5060            + self.twiddle7.im * x724n.im
5061            + self.twiddle8.im * x823n.im
5062            + self.twiddle9.im * x922n.im
5063            + self.twiddle10.im * x1021n.im
5064            + self.twiddle11.im * x1120n.im
5065            + self.twiddle12.im * x1219n.im
5066            + self.twiddle13.im * x1318n.im
5067            + self.twiddle14.im * x1417n.im
5068            + self.twiddle15.im * x1516n.im;
5069        let b229re_a = buffer.load(0).re
5070            + self.twiddle2.re * x130p.re
5071            + self.twiddle4.re * x229p.re
5072            + self.twiddle6.re * x328p.re
5073            + self.twiddle8.re * x427p.re
5074            + self.twiddle10.re * x526p.re
5075            + self.twiddle12.re * x625p.re
5076            + self.twiddle14.re * x724p.re
5077            + self.twiddle15.re * x823p.re
5078            + self.twiddle13.re * x922p.re
5079            + self.twiddle11.re * x1021p.re
5080            + self.twiddle9.re * x1120p.re
5081            + self.twiddle7.re * x1219p.re
5082            + self.twiddle5.re * x1318p.re
5083            + self.twiddle3.re * x1417p.re
5084            + self.twiddle1.re * x1516p.re;
5085        let b229re_b = self.twiddle2.im * x130n.im
5086            + self.twiddle4.im * x229n.im
5087            + self.twiddle6.im * x328n.im
5088            + self.twiddle8.im * x427n.im
5089            + self.twiddle10.im * x526n.im
5090            + self.twiddle12.im * x625n.im
5091            + self.twiddle14.im * x724n.im
5092            + -self.twiddle15.im * x823n.im
5093            + -self.twiddle13.im * x922n.im
5094            + -self.twiddle11.im * x1021n.im
5095            + -self.twiddle9.im * x1120n.im
5096            + -self.twiddle7.im * x1219n.im
5097            + -self.twiddle5.im * x1318n.im
5098            + -self.twiddle3.im * x1417n.im
5099            + -self.twiddle1.im * x1516n.im;
5100        let b328re_a = buffer.load(0).re
5101            + self.twiddle3.re * x130p.re
5102            + self.twiddle6.re * x229p.re
5103            + self.twiddle9.re * x328p.re
5104            + self.twiddle12.re * x427p.re
5105            + self.twiddle15.re * x526p.re
5106            + self.twiddle13.re * x625p.re
5107            + self.twiddle10.re * x724p.re
5108            + self.twiddle7.re * x823p.re
5109            + self.twiddle4.re * x922p.re
5110            + self.twiddle1.re * x1021p.re
5111            + self.twiddle2.re * x1120p.re
5112            + self.twiddle5.re * x1219p.re
5113            + self.twiddle8.re * x1318p.re
5114            + self.twiddle11.re * x1417p.re
5115            + self.twiddle14.re * x1516p.re;
5116        let b328re_b = self.twiddle3.im * x130n.im
5117            + self.twiddle6.im * x229n.im
5118            + self.twiddle9.im * x328n.im
5119            + self.twiddle12.im * x427n.im
5120            + self.twiddle15.im * x526n.im
5121            + -self.twiddle13.im * x625n.im
5122            + -self.twiddle10.im * x724n.im
5123            + -self.twiddle7.im * x823n.im
5124            + -self.twiddle4.im * x922n.im
5125            + -self.twiddle1.im * x1021n.im
5126            + self.twiddle2.im * x1120n.im
5127            + self.twiddle5.im * x1219n.im
5128            + self.twiddle8.im * x1318n.im
5129            + self.twiddle11.im * x1417n.im
5130            + self.twiddle14.im * x1516n.im;
5131        let b427re_a = buffer.load(0).re
5132            + self.twiddle4.re * x130p.re
5133            + self.twiddle8.re * x229p.re
5134            + self.twiddle12.re * x328p.re
5135            + self.twiddle15.re * x427p.re
5136            + self.twiddle11.re * x526p.re
5137            + self.twiddle7.re * x625p.re
5138            + self.twiddle3.re * x724p.re
5139            + self.twiddle1.re * x823p.re
5140            + self.twiddle5.re * x922p.re
5141            + self.twiddle9.re * x1021p.re
5142            + self.twiddle13.re * x1120p.re
5143            + self.twiddle14.re * x1219p.re
5144            + self.twiddle10.re * x1318p.re
5145            + self.twiddle6.re * x1417p.re
5146            + self.twiddle2.re * x1516p.re;
5147        let b427re_b = self.twiddle4.im * x130n.im
5148            + self.twiddle8.im * x229n.im
5149            + self.twiddle12.im * x328n.im
5150            + -self.twiddle15.im * x427n.im
5151            + -self.twiddle11.im * x526n.im
5152            + -self.twiddle7.im * x625n.im
5153            + -self.twiddle3.im * x724n.im
5154            + self.twiddle1.im * x823n.im
5155            + self.twiddle5.im * x922n.im
5156            + self.twiddle9.im * x1021n.im
5157            + self.twiddle13.im * x1120n.im
5158            + -self.twiddle14.im * x1219n.im
5159            + -self.twiddle10.im * x1318n.im
5160            + -self.twiddle6.im * x1417n.im
5161            + -self.twiddle2.im * x1516n.im;
5162        let b526re_a = buffer.load(0).re
5163            + self.twiddle5.re * x130p.re
5164            + self.twiddle10.re * x229p.re
5165            + self.twiddle15.re * x328p.re
5166            + self.twiddle11.re * x427p.re
5167            + self.twiddle6.re * x526p.re
5168            + self.twiddle1.re * x625p.re
5169            + self.twiddle4.re * x724p.re
5170            + self.twiddle9.re * x823p.re
5171            + self.twiddle14.re * x922p.re
5172            + self.twiddle12.re * x1021p.re
5173            + self.twiddle7.re * x1120p.re
5174            + self.twiddle2.re * x1219p.re
5175            + self.twiddle3.re * x1318p.re
5176            + self.twiddle8.re * x1417p.re
5177            + self.twiddle13.re * x1516p.re;
5178        let b526re_b = self.twiddle5.im * x130n.im
5179            + self.twiddle10.im * x229n.im
5180            + self.twiddle15.im * x328n.im
5181            + -self.twiddle11.im * x427n.im
5182            + -self.twiddle6.im * x526n.im
5183            + -self.twiddle1.im * x625n.im
5184            + self.twiddle4.im * x724n.im
5185            + self.twiddle9.im * x823n.im
5186            + self.twiddle14.im * x922n.im
5187            + -self.twiddle12.im * x1021n.im
5188            + -self.twiddle7.im * x1120n.im
5189            + -self.twiddle2.im * x1219n.im
5190            + self.twiddle3.im * x1318n.im
5191            + self.twiddle8.im * x1417n.im
5192            + self.twiddle13.im * x1516n.im;
5193        let b625re_a = buffer.load(0).re
5194            + self.twiddle6.re * x130p.re
5195            + self.twiddle12.re * x229p.re
5196            + self.twiddle13.re * x328p.re
5197            + self.twiddle7.re * x427p.re
5198            + self.twiddle1.re * x526p.re
5199            + self.twiddle5.re * x625p.re
5200            + self.twiddle11.re * x724p.re
5201            + self.twiddle14.re * x823p.re
5202            + self.twiddle8.re * x922p.re
5203            + self.twiddle2.re * x1021p.re
5204            + self.twiddle4.re * x1120p.re
5205            + self.twiddle10.re * x1219p.re
5206            + self.twiddle15.re * x1318p.re
5207            + self.twiddle9.re * x1417p.re
5208            + self.twiddle3.re * x1516p.re;
5209        let b625re_b = self.twiddle6.im * x130n.im
5210            + self.twiddle12.im * x229n.im
5211            + -self.twiddle13.im * x328n.im
5212            + -self.twiddle7.im * x427n.im
5213            + -self.twiddle1.im * x526n.im
5214            + self.twiddle5.im * x625n.im
5215            + self.twiddle11.im * x724n.im
5216            + -self.twiddle14.im * x823n.im
5217            + -self.twiddle8.im * x922n.im
5218            + -self.twiddle2.im * x1021n.im
5219            + self.twiddle4.im * x1120n.im
5220            + self.twiddle10.im * x1219n.im
5221            + -self.twiddle15.im * x1318n.im
5222            + -self.twiddle9.im * x1417n.im
5223            + -self.twiddle3.im * x1516n.im;
5224        let b724re_a = buffer.load(0).re
5225            + self.twiddle7.re * x130p.re
5226            + self.twiddle14.re * x229p.re
5227            + self.twiddle10.re * x328p.re
5228            + self.twiddle3.re * x427p.re
5229            + self.twiddle4.re * x526p.re
5230            + self.twiddle11.re * x625p.re
5231            + self.twiddle13.re * x724p.re
5232            + self.twiddle6.re * x823p.re
5233            + self.twiddle1.re * x922p.re
5234            + self.twiddle8.re * x1021p.re
5235            + self.twiddle15.re * x1120p.re
5236            + self.twiddle9.re * x1219p.re
5237            + self.twiddle2.re * x1318p.re
5238            + self.twiddle5.re * x1417p.re
5239            + self.twiddle12.re * x1516p.re;
5240        let b724re_b = self.twiddle7.im * x130n.im
5241            + self.twiddle14.im * x229n.im
5242            + -self.twiddle10.im * x328n.im
5243            + -self.twiddle3.im * x427n.im
5244            + self.twiddle4.im * x526n.im
5245            + self.twiddle11.im * x625n.im
5246            + -self.twiddle13.im * x724n.im
5247            + -self.twiddle6.im * x823n.im
5248            + self.twiddle1.im * x922n.im
5249            + self.twiddle8.im * x1021n.im
5250            + self.twiddle15.im * x1120n.im
5251            + -self.twiddle9.im * x1219n.im
5252            + -self.twiddle2.im * x1318n.im
5253            + self.twiddle5.im * x1417n.im
5254            + self.twiddle12.im * x1516n.im;
5255        let b823re_a = buffer.load(0).re
5256            + self.twiddle8.re * x130p.re
5257            + self.twiddle15.re * x229p.re
5258            + self.twiddle7.re * x328p.re
5259            + self.twiddle1.re * x427p.re
5260            + self.twiddle9.re * x526p.re
5261            + self.twiddle14.re * x625p.re
5262            + self.twiddle6.re * x724p.re
5263            + self.twiddle2.re * x823p.re
5264            + self.twiddle10.re * x922p.re
5265            + self.twiddle13.re * x1021p.re
5266            + self.twiddle5.re * x1120p.re
5267            + self.twiddle3.re * x1219p.re
5268            + self.twiddle11.re * x1318p.re
5269            + self.twiddle12.re * x1417p.re
5270            + self.twiddle4.re * x1516p.re;
5271        let b823re_b = self.twiddle8.im * x130n.im
5272            + -self.twiddle15.im * x229n.im
5273            + -self.twiddle7.im * x328n.im
5274            + self.twiddle1.im * x427n.im
5275            + self.twiddle9.im * x526n.im
5276            + -self.twiddle14.im * x625n.im
5277            + -self.twiddle6.im * x724n.im
5278            + self.twiddle2.im * x823n.im
5279            + self.twiddle10.im * x922n.im
5280            + -self.twiddle13.im * x1021n.im
5281            + -self.twiddle5.im * x1120n.im
5282            + self.twiddle3.im * x1219n.im
5283            + self.twiddle11.im * x1318n.im
5284            + -self.twiddle12.im * x1417n.im
5285            + -self.twiddle4.im * x1516n.im;
5286        let b922re_a = buffer.load(0).re
5287            + self.twiddle9.re * x130p.re
5288            + self.twiddle13.re * x229p.re
5289            + self.twiddle4.re * x328p.re
5290            + self.twiddle5.re * x427p.re
5291            + self.twiddle14.re * x526p.re
5292            + self.twiddle8.re * x625p.re
5293            + self.twiddle1.re * x724p.re
5294            + self.twiddle10.re * x823p.re
5295            + self.twiddle12.re * x922p.re
5296            + self.twiddle3.re * x1021p.re
5297            + self.twiddle6.re * x1120p.re
5298            + self.twiddle15.re * x1219p.re
5299            + self.twiddle7.re * x1318p.re
5300            + self.twiddle2.re * x1417p.re
5301            + self.twiddle11.re * x1516p.re;
5302        let b922re_b = self.twiddle9.im * x130n.im
5303            + -self.twiddle13.im * x229n.im
5304            + -self.twiddle4.im * x328n.im
5305            + self.twiddle5.im * x427n.im
5306            + self.twiddle14.im * x526n.im
5307            + -self.twiddle8.im * x625n.im
5308            + self.twiddle1.im * x724n.im
5309            + self.twiddle10.im * x823n.im
5310            + -self.twiddle12.im * x922n.im
5311            + -self.twiddle3.im * x1021n.im
5312            + self.twiddle6.im * x1120n.im
5313            + self.twiddle15.im * x1219n.im
5314            + -self.twiddle7.im * x1318n.im
5315            + self.twiddle2.im * x1417n.im
5316            + self.twiddle11.im * x1516n.im;
5317        let b1021re_a = buffer.load(0).re
5318            + self.twiddle10.re * x130p.re
5319            + self.twiddle11.re * x229p.re
5320            + self.twiddle1.re * x328p.re
5321            + self.twiddle9.re * x427p.re
5322            + self.twiddle12.re * x526p.re
5323            + self.twiddle2.re * x625p.re
5324            + self.twiddle8.re * x724p.re
5325            + self.twiddle13.re * x823p.re
5326            + self.twiddle3.re * x922p.re
5327            + self.twiddle7.re * x1021p.re
5328            + self.twiddle14.re * x1120p.re
5329            + self.twiddle4.re * x1219p.re
5330            + self.twiddle6.re * x1318p.re
5331            + self.twiddle15.re * x1417p.re
5332            + self.twiddle5.re * x1516p.re;
5333        let b1021re_b = self.twiddle10.im * x130n.im
5334            + -self.twiddle11.im * x229n.im
5335            + -self.twiddle1.im * x328n.im
5336            + self.twiddle9.im * x427n.im
5337            + -self.twiddle12.im * x526n.im
5338            + -self.twiddle2.im * x625n.im
5339            + self.twiddle8.im * x724n.im
5340            + -self.twiddle13.im * x823n.im
5341            + -self.twiddle3.im * x922n.im
5342            + self.twiddle7.im * x1021n.im
5343            + -self.twiddle14.im * x1120n.im
5344            + -self.twiddle4.im * x1219n.im
5345            + self.twiddle6.im * x1318n.im
5346            + -self.twiddle15.im * x1417n.im
5347            + -self.twiddle5.im * x1516n.im;
5348        let b1120re_a = buffer.load(0).re
5349            + self.twiddle11.re * x130p.re
5350            + self.twiddle9.re * x229p.re
5351            + self.twiddle2.re * x328p.re
5352            + self.twiddle13.re * x427p.re
5353            + self.twiddle7.re * x526p.re
5354            + self.twiddle4.re * x625p.re
5355            + self.twiddle15.re * x724p.re
5356            + self.twiddle5.re * x823p.re
5357            + self.twiddle6.re * x922p.re
5358            + self.twiddle14.re * x1021p.re
5359            + self.twiddle3.re * x1120p.re
5360            + self.twiddle8.re * x1219p.re
5361            + self.twiddle12.re * x1318p.re
5362            + self.twiddle1.re * x1417p.re
5363            + self.twiddle10.re * x1516p.re;
5364        let b1120re_b = self.twiddle11.im * x130n.im
5365            + -self.twiddle9.im * x229n.im
5366            + self.twiddle2.im * x328n.im
5367            + self.twiddle13.im * x427n.im
5368            + -self.twiddle7.im * x526n.im
5369            + self.twiddle4.im * x625n.im
5370            + self.twiddle15.im * x724n.im
5371            + -self.twiddle5.im * x823n.im
5372            + self.twiddle6.im * x922n.im
5373            + -self.twiddle14.im * x1021n.im
5374            + -self.twiddle3.im * x1120n.im
5375            + self.twiddle8.im * x1219n.im
5376            + -self.twiddle12.im * x1318n.im
5377            + -self.twiddle1.im * x1417n.im
5378            + self.twiddle10.im * x1516n.im;
5379        let b1219re_a = buffer.load(0).re
5380            + self.twiddle12.re * x130p.re
5381            + self.twiddle7.re * x229p.re
5382            + self.twiddle5.re * x328p.re
5383            + self.twiddle14.re * x427p.re
5384            + self.twiddle2.re * x526p.re
5385            + self.twiddle10.re * x625p.re
5386            + self.twiddle9.re * x724p.re
5387            + self.twiddle3.re * x823p.re
5388            + self.twiddle15.re * x922p.re
5389            + self.twiddle4.re * x1021p.re
5390            + self.twiddle8.re * x1120p.re
5391            + self.twiddle11.re * x1219p.re
5392            + self.twiddle1.re * x1318p.re
5393            + self.twiddle13.re * x1417p.re
5394            + self.twiddle6.re * x1516p.re;
5395        let b1219re_b = self.twiddle12.im * x130n.im
5396            + -self.twiddle7.im * x229n.im
5397            + self.twiddle5.im * x328n.im
5398            + -self.twiddle14.im * x427n.im
5399            + -self.twiddle2.im * x526n.im
5400            + self.twiddle10.im * x625n.im
5401            + -self.twiddle9.im * x724n.im
5402            + self.twiddle3.im * x823n.im
5403            + self.twiddle15.im * x922n.im
5404            + -self.twiddle4.im * x1021n.im
5405            + self.twiddle8.im * x1120n.im
5406            + -self.twiddle11.im * x1219n.im
5407            + self.twiddle1.im * x1318n.im
5408            + self.twiddle13.im * x1417n.im
5409            + -self.twiddle6.im * x1516n.im;
5410        let b1318re_a = buffer.load(0).re
5411            + self.twiddle13.re * x130p.re
5412            + self.twiddle5.re * x229p.re
5413            + self.twiddle8.re * x328p.re
5414            + self.twiddle10.re * x427p.re
5415            + self.twiddle3.re * x526p.re
5416            + self.twiddle15.re * x625p.re
5417            + self.twiddle2.re * x724p.re
5418            + self.twiddle11.re * x823p.re
5419            + self.twiddle7.re * x922p.re
5420            + self.twiddle6.re * x1021p.re
5421            + self.twiddle12.re * x1120p.re
5422            + self.twiddle1.re * x1219p.re
5423            + self.twiddle14.re * x1318p.re
5424            + self.twiddle4.re * x1417p.re
5425            + self.twiddle9.re * x1516p.re;
5426        let b1318re_b = self.twiddle13.im * x130n.im
5427            + -self.twiddle5.im * x229n.im
5428            + self.twiddle8.im * x328n.im
5429            + -self.twiddle10.im * x427n.im
5430            + self.twiddle3.im * x526n.im
5431            + -self.twiddle15.im * x625n.im
5432            + -self.twiddle2.im * x724n.im
5433            + self.twiddle11.im * x823n.im
5434            + -self.twiddle7.im * x922n.im
5435            + self.twiddle6.im * x1021n.im
5436            + -self.twiddle12.im * x1120n.im
5437            + self.twiddle1.im * x1219n.im
5438            + self.twiddle14.im * x1318n.im
5439            + -self.twiddle4.im * x1417n.im
5440            + self.twiddle9.im * x1516n.im;
5441        let b1417re_a = buffer.load(0).re
5442            + self.twiddle14.re * x130p.re
5443            + self.twiddle3.re * x229p.re
5444            + self.twiddle11.re * x328p.re
5445            + self.twiddle6.re * x427p.re
5446            + self.twiddle8.re * x526p.re
5447            + self.twiddle9.re * x625p.re
5448            + self.twiddle5.re * x724p.re
5449            + self.twiddle12.re * x823p.re
5450            + self.twiddle2.re * x922p.re
5451            + self.twiddle15.re * x1021p.re
5452            + self.twiddle1.re * x1120p.re
5453            + self.twiddle13.re * x1219p.re
5454            + self.twiddle4.re * x1318p.re
5455            + self.twiddle10.re * x1417p.re
5456            + self.twiddle7.re * x1516p.re;
5457        let b1417re_b = self.twiddle14.im * x130n.im
5458            + -self.twiddle3.im * x229n.im
5459            + self.twiddle11.im * x328n.im
5460            + -self.twiddle6.im * x427n.im
5461            + self.twiddle8.im * x526n.im
5462            + -self.twiddle9.im * x625n.im
5463            + self.twiddle5.im * x724n.im
5464            + -self.twiddle12.im * x823n.im
5465            + self.twiddle2.im * x922n.im
5466            + -self.twiddle15.im * x1021n.im
5467            + -self.twiddle1.im * x1120n.im
5468            + self.twiddle13.im * x1219n.im
5469            + -self.twiddle4.im * x1318n.im
5470            + self.twiddle10.im * x1417n.im
5471            + -self.twiddle7.im * x1516n.im;
5472        let b1516re_a = buffer.load(0).re
5473            + self.twiddle15.re * x130p.re
5474            + self.twiddle1.re * x229p.re
5475            + self.twiddle14.re * x328p.re
5476            + self.twiddle2.re * x427p.re
5477            + self.twiddle13.re * x526p.re
5478            + self.twiddle3.re * x625p.re
5479            + self.twiddle12.re * x724p.re
5480            + self.twiddle4.re * x823p.re
5481            + self.twiddle11.re * x922p.re
5482            + self.twiddle5.re * x1021p.re
5483            + self.twiddle10.re * x1120p.re
5484            + self.twiddle6.re * x1219p.re
5485            + self.twiddle9.re * x1318p.re
5486            + self.twiddle7.re * x1417p.re
5487            + self.twiddle8.re * x1516p.re;
5488        let b1516re_b = self.twiddle15.im * x130n.im
5489            + -self.twiddle1.im * x229n.im
5490            + self.twiddle14.im * x328n.im
5491            + -self.twiddle2.im * x427n.im
5492            + self.twiddle13.im * x526n.im
5493            + -self.twiddle3.im * x625n.im
5494            + self.twiddle12.im * x724n.im
5495            + -self.twiddle4.im * x823n.im
5496            + self.twiddle11.im * x922n.im
5497            + -self.twiddle5.im * x1021n.im
5498            + self.twiddle10.im * x1120n.im
5499            + -self.twiddle6.im * x1219n.im
5500            + self.twiddle9.im * x1318n.im
5501            + -self.twiddle7.im * x1417n.im
5502            + self.twiddle8.im * x1516n.im;
5503
5504        let b130im_a = buffer.load(0).im
5505            + self.twiddle1.re * x130p.im
5506            + self.twiddle2.re * x229p.im
5507            + self.twiddle3.re * x328p.im
5508            + self.twiddle4.re * x427p.im
5509            + self.twiddle5.re * x526p.im
5510            + self.twiddle6.re * x625p.im
5511            + self.twiddle7.re * x724p.im
5512            + self.twiddle8.re * x823p.im
5513            + self.twiddle9.re * x922p.im
5514            + self.twiddle10.re * x1021p.im
5515            + self.twiddle11.re * x1120p.im
5516            + self.twiddle12.re * x1219p.im
5517            + self.twiddle13.re * x1318p.im
5518            + self.twiddle14.re * x1417p.im
5519            + self.twiddle15.re * x1516p.im;
5520        let b130im_b = self.twiddle1.im * x130n.re
5521            + self.twiddle2.im * x229n.re
5522            + self.twiddle3.im * x328n.re
5523            + self.twiddle4.im * x427n.re
5524            + self.twiddle5.im * x526n.re
5525            + self.twiddle6.im * x625n.re
5526            + self.twiddle7.im * x724n.re
5527            + self.twiddle8.im * x823n.re
5528            + self.twiddle9.im * x922n.re
5529            + self.twiddle10.im * x1021n.re
5530            + self.twiddle11.im * x1120n.re
5531            + self.twiddle12.im * x1219n.re
5532            + self.twiddle13.im * x1318n.re
5533            + self.twiddle14.im * x1417n.re
5534            + self.twiddle15.im * x1516n.re;
5535        let b229im_a = buffer.load(0).im
5536            + self.twiddle2.re * x130p.im
5537            + self.twiddle4.re * x229p.im
5538            + self.twiddle6.re * x328p.im
5539            + self.twiddle8.re * x427p.im
5540            + self.twiddle10.re * x526p.im
5541            + self.twiddle12.re * x625p.im
5542            + self.twiddle14.re * x724p.im
5543            + self.twiddle15.re * x823p.im
5544            + self.twiddle13.re * x922p.im
5545            + self.twiddle11.re * x1021p.im
5546            + self.twiddle9.re * x1120p.im
5547            + self.twiddle7.re * x1219p.im
5548            + self.twiddle5.re * x1318p.im
5549            + self.twiddle3.re * x1417p.im
5550            + self.twiddle1.re * x1516p.im;
5551        let b229im_b = self.twiddle2.im * x130n.re
5552            + self.twiddle4.im * x229n.re
5553            + self.twiddle6.im * x328n.re
5554            + self.twiddle8.im * x427n.re
5555            + self.twiddle10.im * x526n.re
5556            + self.twiddle12.im * x625n.re
5557            + self.twiddle14.im * x724n.re
5558            + -self.twiddle15.im * x823n.re
5559            + -self.twiddle13.im * x922n.re
5560            + -self.twiddle11.im * x1021n.re
5561            + -self.twiddle9.im * x1120n.re
5562            + -self.twiddle7.im * x1219n.re
5563            + -self.twiddle5.im * x1318n.re
5564            + -self.twiddle3.im * x1417n.re
5565            + -self.twiddle1.im * x1516n.re;
5566        let b328im_a = buffer.load(0).im
5567            + self.twiddle3.re * x130p.im
5568            + self.twiddle6.re * x229p.im
5569            + self.twiddle9.re * x328p.im
5570            + self.twiddle12.re * x427p.im
5571            + self.twiddle15.re * x526p.im
5572            + self.twiddle13.re * x625p.im
5573            + self.twiddle10.re * x724p.im
5574            + self.twiddle7.re * x823p.im
5575            + self.twiddle4.re * x922p.im
5576            + self.twiddle1.re * x1021p.im
5577            + self.twiddle2.re * x1120p.im
5578            + self.twiddle5.re * x1219p.im
5579            + self.twiddle8.re * x1318p.im
5580            + self.twiddle11.re * x1417p.im
5581            + self.twiddle14.re * x1516p.im;
5582        let b328im_b = self.twiddle3.im * x130n.re
5583            + self.twiddle6.im * x229n.re
5584            + self.twiddle9.im * x328n.re
5585            + self.twiddle12.im * x427n.re
5586            + self.twiddle15.im * x526n.re
5587            + -self.twiddle13.im * x625n.re
5588            + -self.twiddle10.im * x724n.re
5589            + -self.twiddle7.im * x823n.re
5590            + -self.twiddle4.im * x922n.re
5591            + -self.twiddle1.im * x1021n.re
5592            + self.twiddle2.im * x1120n.re
5593            + self.twiddle5.im * x1219n.re
5594            + self.twiddle8.im * x1318n.re
5595            + self.twiddle11.im * x1417n.re
5596            + self.twiddle14.im * x1516n.re;
5597        let b427im_a = buffer.load(0).im
5598            + self.twiddle4.re * x130p.im
5599            + self.twiddle8.re * x229p.im
5600            + self.twiddle12.re * x328p.im
5601            + self.twiddle15.re * x427p.im
5602            + self.twiddle11.re * x526p.im
5603            + self.twiddle7.re * x625p.im
5604            + self.twiddle3.re * x724p.im
5605            + self.twiddle1.re * x823p.im
5606            + self.twiddle5.re * x922p.im
5607            + self.twiddle9.re * x1021p.im
5608            + self.twiddle13.re * x1120p.im
5609            + self.twiddle14.re * x1219p.im
5610            + self.twiddle10.re * x1318p.im
5611            + self.twiddle6.re * x1417p.im
5612            + self.twiddle2.re * x1516p.im;
5613        let b427im_b = self.twiddle4.im * x130n.re
5614            + self.twiddle8.im * x229n.re
5615            + self.twiddle12.im * x328n.re
5616            + -self.twiddle15.im * x427n.re
5617            + -self.twiddle11.im * x526n.re
5618            + -self.twiddle7.im * x625n.re
5619            + -self.twiddle3.im * x724n.re
5620            + self.twiddle1.im * x823n.re
5621            + self.twiddle5.im * x922n.re
5622            + self.twiddle9.im * x1021n.re
5623            + self.twiddle13.im * x1120n.re
5624            + -self.twiddle14.im * x1219n.re
5625            + -self.twiddle10.im * x1318n.re
5626            + -self.twiddle6.im * x1417n.re
5627            + -self.twiddle2.im * x1516n.re;
5628        let b526im_a = buffer.load(0).im
5629            + self.twiddle5.re * x130p.im
5630            + self.twiddle10.re * x229p.im
5631            + self.twiddle15.re * x328p.im
5632            + self.twiddle11.re * x427p.im
5633            + self.twiddle6.re * x526p.im
5634            + self.twiddle1.re * x625p.im
5635            + self.twiddle4.re * x724p.im
5636            + self.twiddle9.re * x823p.im
5637            + self.twiddle14.re * x922p.im
5638            + self.twiddle12.re * x1021p.im
5639            + self.twiddle7.re * x1120p.im
5640            + self.twiddle2.re * x1219p.im
5641            + self.twiddle3.re * x1318p.im
5642            + self.twiddle8.re * x1417p.im
5643            + self.twiddle13.re * x1516p.im;
5644        let b526im_b = self.twiddle5.im * x130n.re
5645            + self.twiddle10.im * x229n.re
5646            + self.twiddle15.im * x328n.re
5647            + -self.twiddle11.im * x427n.re
5648            + -self.twiddle6.im * x526n.re
5649            + -self.twiddle1.im * x625n.re
5650            + self.twiddle4.im * x724n.re
5651            + self.twiddle9.im * x823n.re
5652            + self.twiddle14.im * x922n.re
5653            + -self.twiddle12.im * x1021n.re
5654            + -self.twiddle7.im * x1120n.re
5655            + -self.twiddle2.im * x1219n.re
5656            + self.twiddle3.im * x1318n.re
5657            + self.twiddle8.im * x1417n.re
5658            + self.twiddle13.im * x1516n.re;
5659        let b625im_a = buffer.load(0).im
5660            + self.twiddle6.re * x130p.im
5661            + self.twiddle12.re * x229p.im
5662            + self.twiddle13.re * x328p.im
5663            + self.twiddle7.re * x427p.im
5664            + self.twiddle1.re * x526p.im
5665            + self.twiddle5.re * x625p.im
5666            + self.twiddle11.re * x724p.im
5667            + self.twiddle14.re * x823p.im
5668            + self.twiddle8.re * x922p.im
5669            + self.twiddle2.re * x1021p.im
5670            + self.twiddle4.re * x1120p.im
5671            + self.twiddle10.re * x1219p.im
5672            + self.twiddle15.re * x1318p.im
5673            + self.twiddle9.re * x1417p.im
5674            + self.twiddle3.re * x1516p.im;
5675        let b625im_b = self.twiddle6.im * x130n.re
5676            + self.twiddle12.im * x229n.re
5677            + -self.twiddle13.im * x328n.re
5678            + -self.twiddle7.im * x427n.re
5679            + -self.twiddle1.im * x526n.re
5680            + self.twiddle5.im * x625n.re
5681            + self.twiddle11.im * x724n.re
5682            + -self.twiddle14.im * x823n.re
5683            + -self.twiddle8.im * x922n.re
5684            + -self.twiddle2.im * x1021n.re
5685            + self.twiddle4.im * x1120n.re
5686            + self.twiddle10.im * x1219n.re
5687            + -self.twiddle15.im * x1318n.re
5688            + -self.twiddle9.im * x1417n.re
5689            + -self.twiddle3.im * x1516n.re;
5690        let b724im_a = buffer.load(0).im
5691            + self.twiddle7.re * x130p.im
5692            + self.twiddle14.re * x229p.im
5693            + self.twiddle10.re * x328p.im
5694            + self.twiddle3.re * x427p.im
5695            + self.twiddle4.re * x526p.im
5696            + self.twiddle11.re * x625p.im
5697            + self.twiddle13.re * x724p.im
5698            + self.twiddle6.re * x823p.im
5699            + self.twiddle1.re * x922p.im
5700            + self.twiddle8.re * x1021p.im
5701            + self.twiddle15.re * x1120p.im
5702            + self.twiddle9.re * x1219p.im
5703            + self.twiddle2.re * x1318p.im
5704            + self.twiddle5.re * x1417p.im
5705            + self.twiddle12.re * x1516p.im;
5706        let b724im_b = self.twiddle7.im * x130n.re
5707            + self.twiddle14.im * x229n.re
5708            + -self.twiddle10.im * x328n.re
5709            + -self.twiddle3.im * x427n.re
5710            + self.twiddle4.im * x526n.re
5711            + self.twiddle11.im * x625n.re
5712            + -self.twiddle13.im * x724n.re
5713            + -self.twiddle6.im * x823n.re
5714            + self.twiddle1.im * x922n.re
5715            + self.twiddle8.im * x1021n.re
5716            + self.twiddle15.im * x1120n.re
5717            + -self.twiddle9.im * x1219n.re
5718            + -self.twiddle2.im * x1318n.re
5719            + self.twiddle5.im * x1417n.re
5720            + self.twiddle12.im * x1516n.re;
5721        let b823im_a = buffer.load(0).im
5722            + self.twiddle8.re * x130p.im
5723            + self.twiddle15.re * x229p.im
5724            + self.twiddle7.re * x328p.im
5725            + self.twiddle1.re * x427p.im
5726            + self.twiddle9.re * x526p.im
5727            + self.twiddle14.re * x625p.im
5728            + self.twiddle6.re * x724p.im
5729            + self.twiddle2.re * x823p.im
5730            + self.twiddle10.re * x922p.im
5731            + self.twiddle13.re * x1021p.im
5732            + self.twiddle5.re * x1120p.im
5733            + self.twiddle3.re * x1219p.im
5734            + self.twiddle11.re * x1318p.im
5735            + self.twiddle12.re * x1417p.im
5736            + self.twiddle4.re * x1516p.im;
5737        let b823im_b = self.twiddle8.im * x130n.re
5738            + -self.twiddle15.im * x229n.re
5739            + -self.twiddle7.im * x328n.re
5740            + self.twiddle1.im * x427n.re
5741            + self.twiddle9.im * x526n.re
5742            + -self.twiddle14.im * x625n.re
5743            + -self.twiddle6.im * x724n.re
5744            + self.twiddle2.im * x823n.re
5745            + self.twiddle10.im * x922n.re
5746            + -self.twiddle13.im * x1021n.re
5747            + -self.twiddle5.im * x1120n.re
5748            + self.twiddle3.im * x1219n.re
5749            + self.twiddle11.im * x1318n.re
5750            + -self.twiddle12.im * x1417n.re
5751            + -self.twiddle4.im * x1516n.re;
5752        let b922im_a = buffer.load(0).im
5753            + self.twiddle9.re * x130p.im
5754            + self.twiddle13.re * x229p.im
5755            + self.twiddle4.re * x328p.im
5756            + self.twiddle5.re * x427p.im
5757            + self.twiddle14.re * x526p.im
5758            + self.twiddle8.re * x625p.im
5759            + self.twiddle1.re * x724p.im
5760            + self.twiddle10.re * x823p.im
5761            + self.twiddle12.re * x922p.im
5762            + self.twiddle3.re * x1021p.im
5763            + self.twiddle6.re * x1120p.im
5764            + self.twiddle15.re * x1219p.im
5765            + self.twiddle7.re * x1318p.im
5766            + self.twiddle2.re * x1417p.im
5767            + self.twiddle11.re * x1516p.im;
5768        let b922im_b = self.twiddle9.im * x130n.re
5769            + -self.twiddle13.im * x229n.re
5770            + -self.twiddle4.im * x328n.re
5771            + self.twiddle5.im * x427n.re
5772            + self.twiddle14.im * x526n.re
5773            + -self.twiddle8.im * x625n.re
5774            + self.twiddle1.im * x724n.re
5775            + self.twiddle10.im * x823n.re
5776            + -self.twiddle12.im * x922n.re
5777            + -self.twiddle3.im * x1021n.re
5778            + self.twiddle6.im * x1120n.re
5779            + self.twiddle15.im * x1219n.re
5780            + -self.twiddle7.im * x1318n.re
5781            + self.twiddle2.im * x1417n.re
5782            + self.twiddle11.im * x1516n.re;
5783        let b1021im_a = buffer.load(0).im
5784            + self.twiddle10.re * x130p.im
5785            + self.twiddle11.re * x229p.im
5786            + self.twiddle1.re * x328p.im
5787            + self.twiddle9.re * x427p.im
5788            + self.twiddle12.re * x526p.im
5789            + self.twiddle2.re * x625p.im
5790            + self.twiddle8.re * x724p.im
5791            + self.twiddle13.re * x823p.im
5792            + self.twiddle3.re * x922p.im
5793            + self.twiddle7.re * x1021p.im
5794            + self.twiddle14.re * x1120p.im
5795            + self.twiddle4.re * x1219p.im
5796            + self.twiddle6.re * x1318p.im
5797            + self.twiddle15.re * x1417p.im
5798            + self.twiddle5.re * x1516p.im;
5799        let b1021im_b = self.twiddle10.im * x130n.re
5800            + -self.twiddle11.im * x229n.re
5801            + -self.twiddle1.im * x328n.re
5802            + self.twiddle9.im * x427n.re
5803            + -self.twiddle12.im * x526n.re
5804            + -self.twiddle2.im * x625n.re
5805            + self.twiddle8.im * x724n.re
5806            + -self.twiddle13.im * x823n.re
5807            + -self.twiddle3.im * x922n.re
5808            + self.twiddle7.im * x1021n.re
5809            + -self.twiddle14.im * x1120n.re
5810            + -self.twiddle4.im * x1219n.re
5811            + self.twiddle6.im * x1318n.re
5812            + -self.twiddle15.im * x1417n.re
5813            + -self.twiddle5.im * x1516n.re;
5814        let b1120im_a = buffer.load(0).im
5815            + self.twiddle11.re * x130p.im
5816            + self.twiddle9.re * x229p.im
5817            + self.twiddle2.re * x328p.im
5818            + self.twiddle13.re * x427p.im
5819            + self.twiddle7.re * x526p.im
5820            + self.twiddle4.re * x625p.im
5821            + self.twiddle15.re * x724p.im
5822            + self.twiddle5.re * x823p.im
5823            + self.twiddle6.re * x922p.im
5824            + self.twiddle14.re * x1021p.im
5825            + self.twiddle3.re * x1120p.im
5826            + self.twiddle8.re * x1219p.im
5827            + self.twiddle12.re * x1318p.im
5828            + self.twiddle1.re * x1417p.im
5829            + self.twiddle10.re * x1516p.im;
5830        let b1120im_b = self.twiddle11.im * x130n.re
5831            + -self.twiddle9.im * x229n.re
5832            + self.twiddle2.im * x328n.re
5833            + self.twiddle13.im * x427n.re
5834            + -self.twiddle7.im * x526n.re
5835            + self.twiddle4.im * x625n.re
5836            + self.twiddle15.im * x724n.re
5837            + -self.twiddle5.im * x823n.re
5838            + self.twiddle6.im * x922n.re
5839            + -self.twiddle14.im * x1021n.re
5840            + -self.twiddle3.im * x1120n.re
5841            + self.twiddle8.im * x1219n.re
5842            + -self.twiddle12.im * x1318n.re
5843            + -self.twiddle1.im * x1417n.re
5844            + self.twiddle10.im * x1516n.re;
5845        let b1219im_a = buffer.load(0).im
5846            + self.twiddle12.re * x130p.im
5847            + self.twiddle7.re * x229p.im
5848            + self.twiddle5.re * x328p.im
5849            + self.twiddle14.re * x427p.im
5850            + self.twiddle2.re * x526p.im
5851            + self.twiddle10.re * x625p.im
5852            + self.twiddle9.re * x724p.im
5853            + self.twiddle3.re * x823p.im
5854            + self.twiddle15.re * x922p.im
5855            + self.twiddle4.re * x1021p.im
5856            + self.twiddle8.re * x1120p.im
5857            + self.twiddle11.re * x1219p.im
5858            + self.twiddle1.re * x1318p.im
5859            + self.twiddle13.re * x1417p.im
5860            + self.twiddle6.re * x1516p.im;
5861        let b1219im_b = self.twiddle12.im * x130n.re
5862            + -self.twiddle7.im * x229n.re
5863            + self.twiddle5.im * x328n.re
5864            + -self.twiddle14.im * x427n.re
5865            + -self.twiddle2.im * x526n.re
5866            + self.twiddle10.im * x625n.re
5867            + -self.twiddle9.im * x724n.re
5868            + self.twiddle3.im * x823n.re
5869            + self.twiddle15.im * x922n.re
5870            + -self.twiddle4.im * x1021n.re
5871            + self.twiddle8.im * x1120n.re
5872            + -self.twiddle11.im * x1219n.re
5873            + self.twiddle1.im * x1318n.re
5874            + self.twiddle13.im * x1417n.re
5875            + -self.twiddle6.im * x1516n.re;
5876        let b1318im_a = buffer.load(0).im
5877            + self.twiddle13.re * x130p.im
5878            + self.twiddle5.re * x229p.im
5879            + self.twiddle8.re * x328p.im
5880            + self.twiddle10.re * x427p.im
5881            + self.twiddle3.re * x526p.im
5882            + self.twiddle15.re * x625p.im
5883            + self.twiddle2.re * x724p.im
5884            + self.twiddle11.re * x823p.im
5885            + self.twiddle7.re * x922p.im
5886            + self.twiddle6.re * x1021p.im
5887            + self.twiddle12.re * x1120p.im
5888            + self.twiddle1.re * x1219p.im
5889            + self.twiddle14.re * x1318p.im
5890            + self.twiddle4.re * x1417p.im
5891            + self.twiddle9.re * x1516p.im;
5892        let b1318im_b = self.twiddle13.im * x130n.re
5893            + -self.twiddle5.im * x229n.re
5894            + self.twiddle8.im * x328n.re
5895            + -self.twiddle10.im * x427n.re
5896            + self.twiddle3.im * x526n.re
5897            + -self.twiddle15.im * x625n.re
5898            + -self.twiddle2.im * x724n.re
5899            + self.twiddle11.im * x823n.re
5900            + -self.twiddle7.im * x922n.re
5901            + self.twiddle6.im * x1021n.re
5902            + -self.twiddle12.im * x1120n.re
5903            + self.twiddle1.im * x1219n.re
5904            + self.twiddle14.im * x1318n.re
5905            + -self.twiddle4.im * x1417n.re
5906            + self.twiddle9.im * x1516n.re;
5907        let b1417im_a = buffer.load(0).im
5908            + self.twiddle14.re * x130p.im
5909            + self.twiddle3.re * x229p.im
5910            + self.twiddle11.re * x328p.im
5911            + self.twiddle6.re * x427p.im
5912            + self.twiddle8.re * x526p.im
5913            + self.twiddle9.re * x625p.im
5914            + self.twiddle5.re * x724p.im
5915            + self.twiddle12.re * x823p.im
5916            + self.twiddle2.re * x922p.im
5917            + self.twiddle15.re * x1021p.im
5918            + self.twiddle1.re * x1120p.im
5919            + self.twiddle13.re * x1219p.im
5920            + self.twiddle4.re * x1318p.im
5921            + self.twiddle10.re * x1417p.im
5922            + self.twiddle7.re * x1516p.im;
5923        let b1417im_b = self.twiddle14.im * x130n.re
5924            + -self.twiddle3.im * x229n.re
5925            + self.twiddle11.im * x328n.re
5926            + -self.twiddle6.im * x427n.re
5927            + self.twiddle8.im * x526n.re
5928            + -self.twiddle9.im * x625n.re
5929            + self.twiddle5.im * x724n.re
5930            + -self.twiddle12.im * x823n.re
5931            + self.twiddle2.im * x922n.re
5932            + -self.twiddle15.im * x1021n.re
5933            + -self.twiddle1.im * x1120n.re
5934            + self.twiddle13.im * x1219n.re
5935            + -self.twiddle4.im * x1318n.re
5936            + self.twiddle10.im * x1417n.re
5937            + -self.twiddle7.im * x1516n.re;
5938        let b1516im_a = buffer.load(0).im
5939            + self.twiddle15.re * x130p.im
5940            + self.twiddle1.re * x229p.im
5941            + self.twiddle14.re * x328p.im
5942            + self.twiddle2.re * x427p.im
5943            + self.twiddle13.re * x526p.im
5944            + self.twiddle3.re * x625p.im
5945            + self.twiddle12.re * x724p.im
5946            + self.twiddle4.re * x823p.im
5947            + self.twiddle11.re * x922p.im
5948            + self.twiddle5.re * x1021p.im
5949            + self.twiddle10.re * x1120p.im
5950            + self.twiddle6.re * x1219p.im
5951            + self.twiddle9.re * x1318p.im
5952            + self.twiddle7.re * x1417p.im
5953            + self.twiddle8.re * x1516p.im;
5954        let b1516im_b = self.twiddle15.im * x130n.re
5955            + -self.twiddle1.im * x229n.re
5956            + self.twiddle14.im * x328n.re
5957            + -self.twiddle2.im * x427n.re
5958            + self.twiddle13.im * x526n.re
5959            + -self.twiddle3.im * x625n.re
5960            + self.twiddle12.im * x724n.re
5961            + -self.twiddle4.im * x823n.re
5962            + self.twiddle11.im * x922n.re
5963            + -self.twiddle5.im * x1021n.re
5964            + self.twiddle10.im * x1120n.re
5965            + -self.twiddle6.im * x1219n.re
5966            + self.twiddle9.im * x1318n.re
5967            + -self.twiddle7.im * x1417n.re
5968            + self.twiddle8.im * x1516n.re;
5969
5970        let out1re = b130re_a - b130re_b;
5971        let out1im = b130im_a + b130im_b;
5972        let out2re = b229re_a - b229re_b;
5973        let out2im = b229im_a + b229im_b;
5974        let out3re = b328re_a - b328re_b;
5975        let out3im = b328im_a + b328im_b;
5976        let out4re = b427re_a - b427re_b;
5977        let out4im = b427im_a + b427im_b;
5978        let out5re = b526re_a - b526re_b;
5979        let out5im = b526im_a + b526im_b;
5980        let out6re = b625re_a - b625re_b;
5981        let out6im = b625im_a + b625im_b;
5982        let out7re = b724re_a - b724re_b;
5983        let out7im = b724im_a + b724im_b;
5984        let out8re = b823re_a - b823re_b;
5985        let out8im = b823im_a + b823im_b;
5986        let out9re = b922re_a - b922re_b;
5987        let out9im = b922im_a + b922im_b;
5988        let out10re = b1021re_a - b1021re_b;
5989        let out10im = b1021im_a + b1021im_b;
5990        let out11re = b1120re_a - b1120re_b;
5991        let out11im = b1120im_a + b1120im_b;
5992        let out12re = b1219re_a - b1219re_b;
5993        let out12im = b1219im_a + b1219im_b;
5994        let out13re = b1318re_a - b1318re_b;
5995        let out13im = b1318im_a + b1318im_b;
5996        let out14re = b1417re_a - b1417re_b;
5997        let out14im = b1417im_a + b1417im_b;
5998        let out15re = b1516re_a - b1516re_b;
5999        let out15im = b1516im_a + b1516im_b;
6000        let out16re = b1516re_a + b1516re_b;
6001        let out16im = b1516im_a - b1516im_b;
6002        let out17re = b1417re_a + b1417re_b;
6003        let out17im = b1417im_a - b1417im_b;
6004        let out18re = b1318re_a + b1318re_b;
6005        let out18im = b1318im_a - b1318im_b;
6006        let out19re = b1219re_a + b1219re_b;
6007        let out19im = b1219im_a - b1219im_b;
6008        let out20re = b1120re_a + b1120re_b;
6009        let out20im = b1120im_a - b1120im_b;
6010        let out21re = b1021re_a + b1021re_b;
6011        let out21im = b1021im_a - b1021im_b;
6012        let out22re = b922re_a + b922re_b;
6013        let out22im = b922im_a - b922im_b;
6014        let out23re = b823re_a + b823re_b;
6015        let out23im = b823im_a - b823im_b;
6016        let out24re = b724re_a + b724re_b;
6017        let out24im = b724im_a - b724im_b;
6018        let out25re = b625re_a + b625re_b;
6019        let out25im = b625im_a - b625im_b;
6020        let out26re = b526re_a + b526re_b;
6021        let out26im = b526im_a - b526im_b;
6022        let out27re = b427re_a + b427re_b;
6023        let out27im = b427im_a - b427im_b;
6024        let out28re = b328re_a + b328re_b;
6025        let out28im = b328im_a - b328im_b;
6026        let out29re = b229re_a + b229re_b;
6027        let out29im = b229im_a - b229im_b;
6028        let out30re = b130re_a + b130re_b;
6029        let out30im = b130im_a - b130im_b;
6030        buffer.store(sum, 0);
6031        buffer.store(
6032            Complex {
6033                re: out1re,
6034                im: out1im,
6035            },
6036            1,
6037        );
6038        buffer.store(
6039            Complex {
6040                re: out2re,
6041                im: out2im,
6042            },
6043            2,
6044        );
6045        buffer.store(
6046            Complex {
6047                re: out3re,
6048                im: out3im,
6049            },
6050            3,
6051        );
6052        buffer.store(
6053            Complex {
6054                re: out4re,
6055                im: out4im,
6056            },
6057            4,
6058        );
6059        buffer.store(
6060            Complex {
6061                re: out5re,
6062                im: out5im,
6063            },
6064            5,
6065        );
6066        buffer.store(
6067            Complex {
6068                re: out6re,
6069                im: out6im,
6070            },
6071            6,
6072        );
6073        buffer.store(
6074            Complex {
6075                re: out7re,
6076                im: out7im,
6077            },
6078            7,
6079        );
6080        buffer.store(
6081            Complex {
6082                re: out8re,
6083                im: out8im,
6084            },
6085            8,
6086        );
6087        buffer.store(
6088            Complex {
6089                re: out9re,
6090                im: out9im,
6091            },
6092            9,
6093        );
6094        buffer.store(
6095            Complex {
6096                re: out10re,
6097                im: out10im,
6098            },
6099            10,
6100        );
6101        buffer.store(
6102            Complex {
6103                re: out11re,
6104                im: out11im,
6105            },
6106            11,
6107        );
6108        buffer.store(
6109            Complex {
6110                re: out12re,
6111                im: out12im,
6112            },
6113            12,
6114        );
6115        buffer.store(
6116            Complex {
6117                re: out13re,
6118                im: out13im,
6119            },
6120            13,
6121        );
6122        buffer.store(
6123            Complex {
6124                re: out14re,
6125                im: out14im,
6126            },
6127            14,
6128        );
6129        buffer.store(
6130            Complex {
6131                re: out15re,
6132                im: out15im,
6133            },
6134            15,
6135        );
6136        buffer.store(
6137            Complex {
6138                re: out16re,
6139                im: out16im,
6140            },
6141            16,
6142        );
6143        buffer.store(
6144            Complex {
6145                re: out17re,
6146                im: out17im,
6147            },
6148            17,
6149        );
6150        buffer.store(
6151            Complex {
6152                re: out18re,
6153                im: out18im,
6154            },
6155            18,
6156        );
6157        buffer.store(
6158            Complex {
6159                re: out19re,
6160                im: out19im,
6161            },
6162            19,
6163        );
6164        buffer.store(
6165            Complex {
6166                re: out20re,
6167                im: out20im,
6168            },
6169            20,
6170        );
6171        buffer.store(
6172            Complex {
6173                re: out21re,
6174                im: out21im,
6175            },
6176            21,
6177        );
6178        buffer.store(
6179            Complex {
6180                re: out22re,
6181                im: out22im,
6182            },
6183            22,
6184        );
6185        buffer.store(
6186            Complex {
6187                re: out23re,
6188                im: out23im,
6189            },
6190            23,
6191        );
6192        buffer.store(
6193            Complex {
6194                re: out24re,
6195                im: out24im,
6196            },
6197            24,
6198        );
6199        buffer.store(
6200            Complex {
6201                re: out25re,
6202                im: out25im,
6203            },
6204            25,
6205        );
6206        buffer.store(
6207            Complex {
6208                re: out26re,
6209                im: out26im,
6210            },
6211            26,
6212        );
6213        buffer.store(
6214            Complex {
6215                re: out27re,
6216                im: out27im,
6217            },
6218            27,
6219        );
6220        buffer.store(
6221            Complex {
6222                re: out28re,
6223                im: out28im,
6224            },
6225            28,
6226        );
6227        buffer.store(
6228            Complex {
6229                re: out29re,
6230                im: out29im,
6231            },
6232            29,
6233        );
6234        buffer.store(
6235            Complex {
6236                re: out30re,
6237                im: out30im,
6238            },
6239            30,
6240        );
6241    }
6242}
/// Hard-coded FFT butterfly of length 32.
///
/// Holds the 16-point butterfly applied to the even-indexed inputs, the 8-point butterfly
/// applied to each of the two groups of odd-indexed inputs, and the seven twiddle factors
/// that `new` precomputes for recombining their outputs.
pub struct Butterfly32<T> {
    // Sub-FFT run over the 16 even-indexed inputs.
    butterfly16: Butterfly16<T>,
    // Sub-FFT run over each group of 8 odd-indexed inputs.
    butterfly8: Butterfly8<T>,
    // `compute_twiddle(k, 32, direction)` for k = 1..=7, stored at index k - 1.
    twiddles: [Complex<T>; 7],
}
// Generate the shared butterfly boilerplate (including the `Fft` impl) for `Butterfly32`,
// passing 32 as the length and a closure that reads the FFT direction from the inner
// 8-point butterfly.
boilerplate_fft_butterfly!(Butterfly32, 32, |this: &Butterfly32<_>| this
    .butterfly8
    .fft_direction());
6251impl<T: FftNum> Butterfly32<T> {
6252    pub fn new(direction: FftDirection) -> Self {
6253        Self {
6254            butterfly16: Butterfly16::new(direction),
6255            butterfly8: Butterfly8::new(direction),
6256            twiddles: [
6257                twiddles::compute_twiddle(1, 32, direction),
6258                twiddles::compute_twiddle(2, 32, direction),
6259                twiddles::compute_twiddle(3, 32, direction),
6260                twiddles::compute_twiddle(4, 32, direction),
6261                twiddles::compute_twiddle(5, 32, direction),
6262                twiddles::compute_twiddle(6, 32, direction),
6263                twiddles::compute_twiddle(7, 32, direction),
6264            ],
6265        }
6266    }
6267
6268    #[inline(never)]
6269    unsafe fn perform_fft_contiguous(&self, mut buffer: impl LoadStore<T>) {
6270        // we're going to hardcode a step of split radix
6271        // step 1: copy and reorder the  input into the scratch
6272        let mut scratch_evens = [
6273            buffer.load(0),
6274            buffer.load(2),
6275            buffer.load(4),
6276            buffer.load(6),
6277            buffer.load(8),
6278            buffer.load(10),
6279            buffer.load(12),
6280            buffer.load(14),
6281            buffer.load(16),
6282            buffer.load(18),
6283            buffer.load(20),
6284            buffer.load(22),
6285            buffer.load(24),
6286            buffer.load(26),
6287            buffer.load(28),
6288            buffer.load(30),
6289        ];
6290
6291        let mut scratch_odds_n1 = [
6292            buffer.load(1),
6293            buffer.load(5),
6294            buffer.load(9),
6295            buffer.load(13),
6296            buffer.load(17),
6297            buffer.load(21),
6298            buffer.load(25),
6299            buffer.load(29),
6300        ];
6301        let mut scratch_odds_n3 = [
6302            buffer.load(31),
6303            buffer.load(3),
6304            buffer.load(7),
6305            buffer.load(11),
6306            buffer.load(15),
6307            buffer.load(19),
6308            buffer.load(23),
6309            buffer.load(27),
6310        ];
6311
6312        // step 2: column FFTs
6313        self.butterfly16.perform_fft_contiguous(&mut scratch_evens);
6314        self.butterfly8.perform_fft_contiguous(&mut scratch_odds_n1);
6315        self.butterfly8.perform_fft_contiguous(&mut scratch_odds_n3);
6316
6317        // step 3: apply twiddle factors
6318        scratch_odds_n1[1] = scratch_odds_n1[1] * self.twiddles[0];
6319        scratch_odds_n3[1] = scratch_odds_n3[1] * self.twiddles[0].conj();
6320
6321        scratch_odds_n1[2] = scratch_odds_n1[2] * self.twiddles[1];
6322        scratch_odds_n3[2] = scratch_odds_n3[2] * self.twiddles[1].conj();
6323
6324        scratch_odds_n1[3] = scratch_odds_n1[3] * self.twiddles[2];
6325        scratch_odds_n3[3] = scratch_odds_n3[3] * self.twiddles[2].conj();
6326
6327        scratch_odds_n1[4] = scratch_odds_n1[4] * self.twiddles[3];
6328        scratch_odds_n3[4] = scratch_odds_n3[4] * self.twiddles[3].conj();
6329
6330        scratch_odds_n1[5] = scratch_odds_n1[5] * self.twiddles[4];
6331        scratch_odds_n3[5] = scratch_odds_n3[5] * self.twiddles[4].conj();
6332
6333        scratch_odds_n1[6] = scratch_odds_n1[6] * self.twiddles[5];
6334        scratch_odds_n3[6] = scratch_odds_n3[6] * self.twiddles[5].conj();
6335
6336        scratch_odds_n1[7] = scratch_odds_n1[7] * self.twiddles[6];
6337        scratch_odds_n3[7] = scratch_odds_n3[7] * self.twiddles[6].conj();
6338
6339        // step 4: cross FFTs
6340        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[0], &mut scratch_odds_n3[0]);
6341        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[1], &mut scratch_odds_n3[1]);
6342        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[2], &mut scratch_odds_n3[2]);
6343        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[3], &mut scratch_odds_n3[3]);
6344        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[4], &mut scratch_odds_n3[4]);
6345        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[5], &mut scratch_odds_n3[5]);
6346        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[6], &mut scratch_odds_n3[6]);
6347        Butterfly2::perform_fft_strided(&mut scratch_odds_n1[7], &mut scratch_odds_n3[7]);
6348
6349        // apply the butterfly 4 twiddle factor, which is just a rotation
6350        scratch_odds_n3[0] = twiddles::rotate_90(scratch_odds_n3[0], self.fft_direction());
6351        scratch_odds_n3[1] = twiddles::rotate_90(scratch_odds_n3[1], self.fft_direction());
6352        scratch_odds_n3[2] = twiddles::rotate_90(scratch_odds_n3[2], self.fft_direction());
6353        scratch_odds_n3[3] = twiddles::rotate_90(scratch_odds_n3[3], self.fft_direction());
6354        scratch_odds_n3[4] = twiddles::rotate_90(scratch_odds_n3[4], self.fft_direction());
6355        scratch_odds_n3[5] = twiddles::rotate_90(scratch_odds_n3[5], self.fft_direction());
6356        scratch_odds_n3[6] = twiddles::rotate_90(scratch_odds_n3[6], self.fft_direction());
6357        scratch_odds_n3[7] = twiddles::rotate_90(scratch_odds_n3[7], self.fft_direction());
6358
6359        //step 5: copy/add/subtract data back to buffer
6360        buffer.store(scratch_evens[0] + scratch_odds_n1[0], 0);
6361        buffer.store(scratch_evens[1] + scratch_odds_n1[1], 1);
6362        buffer.store(scratch_evens[2] + scratch_odds_n1[2], 2);
6363        buffer.store(scratch_evens[3] + scratch_odds_n1[3], 3);
6364        buffer.store(scratch_evens[4] + scratch_odds_n1[4], 4);
6365        buffer.store(scratch_evens[5] + scratch_odds_n1[5], 5);
6366        buffer.store(scratch_evens[6] + scratch_odds_n1[6], 6);
6367        buffer.store(scratch_evens[7] + scratch_odds_n1[7], 7);
6368        buffer.store(scratch_evens[8] + scratch_odds_n3[0], 8);
6369        buffer.store(scratch_evens[9] + scratch_odds_n3[1], 9);
6370        buffer.store(scratch_evens[10] + scratch_odds_n3[2], 10);
6371        buffer.store(scratch_evens[11] + scratch_odds_n3[3], 11);
6372        buffer.store(scratch_evens[12] + scratch_odds_n3[4], 12);
6373        buffer.store(scratch_evens[13] + scratch_odds_n3[5], 13);
6374        buffer.store(scratch_evens[14] + scratch_odds_n3[6], 14);
6375        buffer.store(scratch_evens[15] + scratch_odds_n3[7], 15);
6376        buffer.store(scratch_evens[0] - scratch_odds_n1[0], 16);
6377        buffer.store(scratch_evens[1] - scratch_odds_n1[1], 17);
6378        buffer.store(scratch_evens[2] - scratch_odds_n1[2], 18);
6379        buffer.store(scratch_evens[3] - scratch_odds_n1[3], 19);
6380        buffer.store(scratch_evens[4] - scratch_odds_n1[4], 20);
6381        buffer.store(scratch_evens[5] - scratch_odds_n1[5], 21);
6382        buffer.store(scratch_evens[6] - scratch_odds_n1[6], 22);
6383        buffer.store(scratch_evens[7] - scratch_odds_n1[7], 23);
6384        buffer.store(scratch_evens[8] - scratch_odds_n3[0], 24);
6385        buffer.store(scratch_evens[9] - scratch_odds_n3[1], 25);
6386        buffer.store(scratch_evens[10] - scratch_odds_n3[2], 26);
6387        buffer.store(scratch_evens[11] - scratch_odds_n3[3], 27);
6388        buffer.store(scratch_evens[12] - scratch_odds_n3[4], 28);
6389        buffer.store(scratch_evens[13] - scratch_odds_n3[5], 29);
6390        buffer.store(scratch_evens[14] - scratch_odds_n3[6], 30);
6391        buffer.store(scratch_evens[15] - scratch_odds_n3[7], 31);
6392    }
6393}
6394
#[cfg(test)]
mod unit_tests {
    use super::*;
    use crate::test_utils::check_fft_algorithm;

    // Every butterfly is tested the same way; only the test name, the struct under test and
    // the FFT size vary, so a macro stamps the tests out.
    macro_rules! test_butterfly_func {
        ($test_name:ident, $struct_name:ident, $size:expr) => {
            #[test]
            fn $test_name() {
                // Forward is checked first, then inverse, each on a freshly built butterfly.
                for &direction in &[FftDirection::Forward, FftDirection::Inverse] {
                    let butterfly = $struct_name::new(direction);
                    check_fft_algorithm::<f32>(&butterfly, $size, direction);
                }
            }
        };
    }

    test_butterfly_func!(test_butterfly2, Butterfly2, 2);
    test_butterfly_func!(test_butterfly3, Butterfly3, 3);
    test_butterfly_func!(test_butterfly4, Butterfly4, 4);
    test_butterfly_func!(test_butterfly5, Butterfly5, 5);
    test_butterfly_func!(test_butterfly6, Butterfly6, 6);
    test_butterfly_func!(test_butterfly7, Butterfly7, 7);
    test_butterfly_func!(test_butterfly8, Butterfly8, 8);
    test_butterfly_func!(test_butterfly9, Butterfly9, 9);
    test_butterfly_func!(test_butterfly11, Butterfly11, 11);
    test_butterfly_func!(test_butterfly12, Butterfly12, 12);
    test_butterfly_func!(test_butterfly13, Butterfly13, 13);
    test_butterfly_func!(test_butterfly16, Butterfly16, 16);
    test_butterfly_func!(test_butterfly17, Butterfly17, 17);
    test_butterfly_func!(test_butterfly19, Butterfly19, 19);
    test_butterfly_func!(test_butterfly23, Butterfly23, 23);
    test_butterfly_func!(test_butterfly24, Butterfly24, 24);
    test_butterfly_func!(test_butterfly27, Butterfly27, 27);
    test_butterfly_func!(test_butterfly29, Butterfly29, 29);
    test_butterfly_func!(test_butterfly31, Butterfly31, 31);
    test_butterfly_func!(test_butterfly32, Butterfly32, 32);
}