feanor_math/algorithms/fft/
cooley_tuckey.rs

1use std::alloc::{Allocator, Global};
2use std::ops::Range;
3use std::fmt::Debug;
4
5use crate::algorithms::unity_root::*;
6use crate::divisibility::{DivisibilityRingStore, DivisibilityRing};
7use crate::rings::zn::*;
8use crate::seq::SwappableVectorViewMut;
9use crate::ring::*;
10use crate::seq::VectorViewMut;
11use crate::homomorphism::*;
12use crate::algorithms::fft::*;
13use crate::rings::float_complex::*;
14use super::complex_fft::*;
15
16///
17/// An optimized implementation of the Cooley-Tukey FFT algorithm, to compute
18/// the Fourier transform of an array with power-of-two length.
19/// 
20/// # Example
21/// ```rust
22/// # use feanor_math::assert_el_eq;
23/// # use feanor_math::ring::*;
24/// # use feanor_math::algorithms::fft::*;
25/// # use feanor_math::rings::zn::*;
26/// # use feanor_math::algorithms::fft::cooley_tuckey::*;
27/// // this ring has a 256-th primitive root of unity
28/// let ring = zn_64::Zn::new(257);
29/// let fft_table = CooleyTuckeyFFT::for_zn(ring, 8).unwrap();
30/// let mut data = [ring.one()].into_iter().chain((0..255).map(|_| ring.zero())).collect::<Vec<_>>();
31/// fft_table.unordered_fft(&mut data, &ring);
32/// assert_el_eq!(ring, ring.one(), data[0]);
33/// assert_el_eq!(ring, ring.one(), data[1]);
34/// ```
35/// 
36/// # Convention
37/// 
38/// This implementation does not follows the standard convention for the mathematical
39/// DFT, by performing the standard/forward FFT with the inverse root of unity `z^-1`.
40/// In other words, the forward FFT computes
41/// ```text
42///   (a_0, ..., a_(n - 1)) -> (sum_j a_j z^(-ij))_i
43/// ```
44/// as demonstrated by
45/// ```rust
46/// # use feanor_math::assert_el_eq;
47/// # use feanor_math::ring::*;
48/// # use feanor_math::algorithms::fft::*;
49/// # use feanor_math::rings::zn::*;
50/// # use feanor_math::algorithms::fft::cooley_tuckey::*;
51/// # use feanor_math::homomorphism::*;
52/// # use feanor_math::divisibility::*;
53/// // this ring has a 4-th primitive root of unity
54/// let ring = zn_64::Zn::new(5);
55/// let root_of_unity = ring.int_hom().map(2);
56/// let fft_table = CooleyTuckeyFFT::new(ring, root_of_unity, 2);
57/// let mut data = [ring.one(), ring.one(), ring.zero(), ring.zero()];
58/// fft_table.fft(&mut data, ring);
59/// let inv_root_of_unity = ring.invert(&root_of_unity).unwrap();
60/// assert_el_eq!(ring, ring.add(ring.one(), inv_root_of_unity), data[1]);
61/// ```
62/// 
63/// # On optimizations
64/// 
65/// I tried my best to make this as fast as possible in general, with special focus
66/// on the Number-theoretic transform case. I did not implement the following
67/// optimizations, for the following reasons:
68///  - Larger butterflies: This would improve data locality, but decrease twiddle
69///    locality (or increase arithmetic operation count). Since I focused mainly on
70///    the `Z/nZ` case, where the twiddles are larger than the ring elements (since they
71///    have additional data to speed up multiplications), this is not sensible.
72///  - The same reasoning applies to a SplitRadix approach, which only actually decreases
73///    the total number of operations if multiplication-by-`i` is free.
74/// 
75pub struct CooleyTuckeyFFT<R_main, R_twiddle, H, A = Global> 
76    where R_main: ?Sized + RingBase,
77        R_twiddle: ?Sized + RingBase,
78        H: Homomorphism<R_twiddle, R_main>,
79        A: Allocator
80{
81    hom: H,
82    root_of_unity: R_main::Element,
83    log2_n: usize,
84    // stores the powers of `root_of_unity^-1` in special bitreversed order
85    root_of_unity_list: Vec<Vec<R_twiddle::Element>>,
86    // stores the powers of `root_of_unity` in special bitreversed order
87    inv_root_of_unity_list: Vec<Vec<R_twiddle::Element>>,
88    allocator: A,
89    two_inv: R_twiddle::Element,
90    n_inv: R_twiddle::Element
91}
92
93///
94/// Assumes that `index` has only the least significant `bits` bits set.
95/// Then computes the value that results from reversing the least significant `bits`
96/// bits.
97/// 
98pub fn bitreverse(index: usize, bits: usize) -> usize {
99    index.reverse_bits().checked_shr(usize::BITS - bits as u32).unwrap_or(0)
100}
101
102#[inline(never)]
103fn butterfly_loop<T, S, F>(log2_n: usize, data: &mut [T], butterfly_range: Range<usize>, stride_range: Range<usize>, log2_step: usize, twiddles: &[S], butterfly: F)
104    where F: Fn(&mut T, &mut T, &S) + Clone
105{
106    assert_eq!(1 << log2_n, data.len());
107    assert!(log2_step < log2_n);
108
109    // the coefficients of a group of inputs have this distance to each other
110    let stride = 1 << (log2_n - log2_step - 1);
111    assert!(stride_range.start <= stride_range.end);
112    assert!(stride_range.end <= stride);
113
114    // how many butterflies we compute within each group
115    assert!(butterfly_range.start <= butterfly_range.end);
116    assert!(butterfly_range.end <= (1 << log2_step));
117    assert!(butterfly_range.end <= twiddles.len());
118    
119    let current_data = &mut data[(stride_range.start + butterfly_range.start * 2 * stride)..];
120    let stride_range_len = stride_range.end - stride_range.start;
121    
122    if stride == 1 && stride_range_len == 1 {
123        for (twiddle, butterfly_data) in twiddles[butterfly_range].iter().zip(current_data.as_chunks_mut::<2>().0.iter_mut()) {
124            let [a, b] = butterfly_data.each_mut();
125            butterfly(a, b, &twiddle);
126        }
127    } else if stride_range_len >= 1 {
128        for (twiddle, butterfly_data) in twiddles[butterfly_range].iter().zip(current_data.chunks_mut(2 * stride)) {
129            let (first, second) = butterfly_data[..(stride + stride_range_len)].split_at_mut(stride);
130            let (first_chunks, first_rem) = first[..stride_range_len].as_chunks_mut::<4>();
131            let (second_chunks, second_rem) = second.as_chunks_mut::<4>();
132            for (a, b) in first_chunks.iter_mut().zip(second_chunks.iter_mut()) {
133                butterfly(&mut a[0], &mut b[0], &twiddle);
134                butterfly(&mut a[1], &mut b[1], &twiddle);
135                butterfly(&mut a[2], &mut b[2], &twiddle);
136                butterfly(&mut a[3], &mut b[3], &twiddle);
137            }
138            for (a, b) in first_rem.iter_mut().zip(second_rem.iter_mut()) {
139                butterfly(a, b, &twiddle);
140            }
141        }
142    }
143}
144
145impl<R_main, H> CooleyTuckeyFFT<R_main, Complex64Base, H, Global> 
146    where R_main: ?Sized + RingBase,
147        H: Homomorphism<Complex64Base, R_main>
148{
149    ///
150    /// Creates an [`CooleyTuckeyFFT`] for the complex field, using the given homomorphism
151    /// to connect the ring implementation for twiddles with the main ring implementation.
152    /// 
153    /// This function is mainly provided for parity with other rings, since in the complex case
154    /// it currently does not make much sense to use a different homomorphism than the identity.
155    /// Hence, it is simpler to use [`CooleyTuckeyFFT::for_complex()`].
156    /// 
157    pub fn for_complex_with_hom(hom: H, log2_n: usize) -> Self {
158        let CC = *hom.domain().get_ring();
159        Self::new_with_pows_with_hom(hom, |i| CC.root_of_unity(i, 1 << log2_n), log2_n)
160    }
161}
162
163impl<R> CooleyTuckeyFFT<Complex64Base, Complex64Base, Identity<R>, Global> 
164    where R: RingStore<Type = Complex64Base>
165{
166    ///
167    /// Creates an [`CooleyTuckeyFFT`] for the complex field.
168    /// 
169    pub fn for_complex(ring: R, log2_n: usize) -> Self {
170        Self::for_complex_with_hom(ring.into_identity(), log2_n)
171    }
172}
173
174impl<R> CooleyTuckeyFFT<R::Type, R::Type, Identity<R>, Global> 
175    where R: RingStore,
176        R::Type: DivisibilityRing
177{
178    ///
179    /// Creates an [`CooleyTuckeyFFT`] for the given ring, using the given root of unity. 
180    /// 
181    /// Do not use this for approximate rings, as computing the powers of `root_of_unity`
182    /// will incur avoidable precision loss.
183    /// 
184    pub fn new(ring: R, root_of_unity: El<R>, log2_n: usize) -> Self {
185        Self::new_with_hom(ring.into_identity(), root_of_unity, log2_n)
186    }
187
188    ///
189    /// Creates an [`CooleyTuckeyFFT`] for the given ring, using the passed function to
190    /// provide the necessary roots of unity.
191    /// 
192    /// Concretely, `root_of_unity_pow(i)` should return `z^i`, where `z` is a `2^log2_n`-th
193    /// primitive root of unity.
194    /// 
195    pub fn new_with_pows<F>(ring: R, root_of_unity_pow: F, log2_n: usize) -> Self 
196        where F: FnMut(i64) -> El<R>
197    {
198        Self::new_with_pows_with_hom(ring.into_identity(), root_of_unity_pow, log2_n)
199    }
200
201    ///
202    /// Creates an [`CooleyTuckeyFFT`] for a prime field, assuming it has a characteristic
203    /// congruent to 1 modulo `2^log2_n`.
204    /// 
205    pub fn for_zn(ring: R, log2_n: usize) -> Option<Self>
206        where R::Type: ZnRing
207    {
208        Self::for_zn_with_hom(ring.into_identity(), log2_n)
209    }
210}
211
212impl<R_main, R_twiddle, H> CooleyTuckeyFFT<R_main, R_twiddle, H, Global> 
213    where R_main: ?Sized + RingBase,
214        R_twiddle: ?Sized + RingBase + DivisibilityRing,
215        H: Homomorphism<R_twiddle, R_main>
216{
217    ///
218    /// Creates an [`CooleyTuckeyFFT`] for the given rings, using the given root of unity.
219    /// 
220    /// Instead of a ring, this function takes a homomorphism `R -> S`. Twiddle factors that are
221    /// precomputed will be stored as elements of `R`, while the main FFT computations will be 
222    /// performed in `S`. This allows both implicit ring conversions, and using patterns like 
223    /// [`zn_64::ZnFastmul`] to precompute some data for better performance.
224    /// 
225    /// Do not use this for approximate rings, as computing the powers of `root_of_unity`
226    /// will incur avoidable precision loss.
227    /// 
228    pub fn new_with_hom(hom: H, root_of_unity: R_twiddle::Element, log2_n: usize) -> Self {
229        let ring = hom.domain();
230        let root_of_unity_pow = |i: i64| if i >= 0 {
231            ring.pow(ring.clone_el(&root_of_unity), i as usize)
232        } else {
233            ring.invert(&ring.pow(ring.clone_el(&root_of_unity), (-i) as usize)).unwrap()
234        };
235        let result = CooleyTuckeyFFT::create(&hom, root_of_unity_pow, log2_n, Global);
236        
237        return CooleyTuckeyFFT {
238            root_of_unity_list: result.root_of_unity_list,
239            inv_root_of_unity_list: result.inv_root_of_unity_list,
240            two_inv: result.two_inv,
241            n_inv: result.n_inv,
242            root_of_unity: result.root_of_unity, 
243            log2_n: result.log2_n, 
244            allocator: result.allocator,
245            hom: hom, 
246        };
247    }
248
249    ///
250    /// Creates an [`CooleyTuckeyFFT`] for the given rings, using the given function to create
251    /// the necessary powers of roots of unity.
252    /// 
253    /// Concretely, `root_of_unity_pow(i)` should return `z^i`, where `z` is a `2^log2_n`-th
254    /// primitive root of unity.
255    /// 
256    /// Instead of a ring, this function takes a homomorphism `R -> S`. Twiddle factors that are
257    /// precomputed will be stored as elements of `R`, while the main FFT computations will be 
258    /// performed in `S`. This allows both implicit ring conversions, and using patterns like 
259    /// [`zn_64::ZnFastmul`] to precompute some data for better performance.
260    /// 
261    pub fn new_with_pows_with_hom<F>(hom: H, root_of_unity_pow: F, log2_n: usize) -> Self 
262        where F: FnMut(i64) -> R_twiddle::Element
263    {
264        Self::create(hom, root_of_unity_pow, log2_n, Global)
265    }
266
267    ///
268    /// Creates an [`CooleyTuckeyFFT`] for the given prime fields, assuming they have
269    /// a characteristic congruent to 1 modulo `2^log2_n`.
270    /// 
271    /// Instead of a ring, this function takes a homomorphism `R -> S`. Twiddle factors that are
272    /// precomputed will be stored as elements of `R`, while the main FFT computations will be 
273    /// performed in `S`. This allows both implicit ring conversions, and using patterns like 
274    /// [`zn_64::ZnFastmul`] to precompute some data for better performance.
275    /// 
276    pub fn for_zn_with_hom(hom: H, log2_n: usize) -> Option<Self>
277        where R_twiddle: ZnRing
278    {
279        let ring_as_field = hom.domain().as_field().ok().unwrap();
280        let root_of_unity = ring_as_field.get_ring().unwrap_element(get_prim_root_of_unity_pow2(&ring_as_field, log2_n)?);
281        drop(ring_as_field);
282        Some(Self::new_with_hom(hom, root_of_unity, log2_n))
283    }
284}
285
286impl<R_main, R_twiddle, H, A> PartialEq for CooleyTuckeyFFT<R_main, R_twiddle, H, A> 
287    where R_main: ?Sized + RingBase,
288        R_twiddle: ?Sized + RingBase + DivisibilityRing,
289        H: Homomorphism<R_twiddle, R_main>,
290        A: Allocator
291{
292    fn eq(&self, other: &Self) -> bool {
293        self.ring().get_ring() == other.ring().get_ring() &&
294            self.log2_n == other.log2_n &&
295            self.ring().eq_el(self.root_of_unity(self.ring()), other.root_of_unity(self.ring()))
296    }
297}
298
299impl<R_main, R_twiddle, H, A> Debug for CooleyTuckeyFFT<R_main, R_twiddle, H, A> 
300    where R_main: ?Sized + RingBase + Debug,
301        R_twiddle: ?Sized + RingBase + DivisibilityRing,
302        H: Homomorphism<R_twiddle, R_main>,
303        A: Allocator
304{
305    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
306        f.debug_struct("CooleyTuckeyFFT")
307            .field("ring", &self.ring().get_ring())
308            .field("n", &(1 << self.log2_n))
309            .field("root_of_unity", &self.ring().format(&self.root_of_unity(self.ring())))
310            .finish()
311    }
312}
313
314impl<R_main, R_twiddle, H, A> Clone for CooleyTuckeyFFT<R_main, R_twiddle, H, A> 
315    where R_main: ?Sized + RingBase,
316        R_twiddle: ?Sized + RingBase + DivisibilityRing,
317        H: Homomorphism<R_twiddle, R_main> + Clone,
318        A: Allocator + Clone
319{
320    fn clone(&self) -> Self {
321        Self {
322            two_inv: self.hom.domain().clone_el(&self.two_inv),
323            n_inv: self.hom.domain().clone_el(&self.n_inv),
324            hom: self.hom.clone(),
325            inv_root_of_unity_list: self.inv_root_of_unity_list.iter().map(|list| list.iter().map(|x| self.hom.domain().clone_el(x)).collect()).collect(),
326            root_of_unity_list: self.root_of_unity_list.iter().map(|list| list.iter().map(|x| self.hom.domain().clone_el(x)).collect()).collect(),
327            root_of_unity: self.hom.codomain().clone_el(&self.root_of_unity),
328            log2_n: self.log2_n,
329            allocator: self.allocator.clone()
330        }
331    }
332}
333
334///
335/// A helper trait that defines the Cooley-Tukey butterfly operation.
336/// It is default-implemented for all rings, but for increase FFT performance, some rings
337/// might wish to provide a specialization.
338/// 
339/// # Why not a subtrait of [`Homomorphism`]?
340/// 
341/// With the current design, indeed making this a subtrait of [`Homomorphism`] would
342/// indeed be the conceptually most fitting choice. It would allow specializing on
343/// the twiddle ring, the main ring and the inclusion. 
344/// 
345/// Unfortunately, there is a technical issue: With the current `min_specialization`, 
346/// we can only specialize on concrete type. If this is a subtrait of [`Homomorphism`], this 
347/// means we can only specialize on, say, `CanHom<ZnFastmul, Zn>`, which then does not give a
348/// specialization for `CanHom<&ZnFastmul, Zn>` - in other words, we would specialize on
349/// the [`RingStore`], and not on the [`RingBase`] as we should. Hence, we'll keep this
350/// suboptimal design until full specialization works.
351/// 
352pub trait CooleyTuckeyButterfly<S>: RingBase
353    where S: ?Sized + RingBase
354{
355    ///
356    /// Should compute `(values[i1], values[i2]) := (values[i1] + twiddle * values[i2], values[i1] - twiddle * values[i2])`.
357    /// 
358    /// It is guaranteed that the input elements are either outputs of
359    /// [`CooleyTuckeyButterfly::butterfly()`] or of [`CooleyTuckeyButterfly::prepare_for_fft()`].
360    /// 
361    /// Deprecated in favor of [`CooleyTuckeyButterfly::butterfly_new()`].
362    /// 
363    #[deprecated]
364    fn butterfly<V: VectorViewMut<Self::Element>, H: Homomorphism<S, Self>>(&self, hom: H, values: &mut V, twiddle: &S::Element, i1: usize, i2: usize);
365
366    ///
367    /// Should compute `(values[i1], values[i2]) := (values[i1] + values[i2], (values[i1] - values[i2]) * twiddle)`
368    /// 
369    /// It is guaranteed that the input elements are either outputs of
370    /// [`CooleyTuckeyButterfly::inv_butterfly()`] or of [`CooleyTuckeyButterfly::prepare_for_inv_fft()`].
371    /// 
372    /// Deprecated in favor of [`CooleyTuckeyButterfly::inv_butterfly_new()`].
373    /// 
374    #[deprecated]
375    fn inv_butterfly<V: VectorViewMut<Self::Element>, H: Homomorphism<S, Self>>(&self, hom: H, values: &mut V, twiddle: &S::Element, i1: usize, i2: usize);
376    
377    ///
378    /// Should compute `(x, y) := (x + twiddle * y, x - twiddle * y)`.
379    /// 
380    /// It is guaranteed that the input elements are either outputs of
381    /// [`CooleyTuckeyButterfly::butterfly_new()`] or of [`CooleyTuckeyButterfly::prepare_for_fft()`].
382    /// 
383    fn butterfly_new<H: Homomorphism<S, Self>>(hom: H, x: &mut Self::Element, y: &mut Self::Element, twiddle: &S::Element);
384    
385    ///
386    /// Should compute `(x, y) := (x + y, (x - y) * twiddle)`
387    /// 
388    /// It is guaranteed that the input elements are either outputs of
389    /// [`CooleyTuckeyButterfly::inv_butterfly_new()`] or of [`CooleyTuckeyButterfly::prepare_for_inv_fft()`].
390    /// 
391    fn inv_butterfly_new<H: Homomorphism<S, Self>>(hom: H, x: &mut Self::Element, y: &mut Self::Element, twiddle: &S::Element);
392
393    ///
394    /// Possibly pre-processes elements before the FFT starts. Here you can
395    /// bring ring element into a certain form, and assume during [`CooleyTuckeyButterfly::butterfly_new()`]
396    /// that the inputs are in this form.
397    /// 
398    #[inline(always)]
399    fn prepare_for_fft(&self, _value: &mut Self::Element) {}
400    
401    ///
402    /// Possibly pre-processes elements before the inverse FFT starts. Here you can
403    /// bring ring element into a certain form, and assume during [`CooleyTuckeyButterfly::inv_butterfly_new()`]
404    /// that the inputs are in this form.
405    /// 
406    #[inline(always)]
407    fn prepare_for_inv_fft(&self, _value: &mut Self::Element) {}
408}
409
410#[allow(deprecated)]
411impl<R, S> CooleyTuckeyButterfly<S> for R
412    where S: ?Sized + RingBase, R: ?Sized + RingBase
413{
414    #[inline(always)]
415    default fn butterfly<V: VectorViewMut<Self::Element>, H: Homomorphism<S, Self>>(&self, hom: H, values: &mut V, twiddle: &<S as RingBase>::Element, i1: usize, i2: usize) {
416        hom.mul_assign_ref_map(values.at_mut(i2), twiddle);
417        let new_a = self.add_ref(values.at(i1), values.at(i2));
418        let a = std::mem::replace(values.at_mut(i1), new_a);
419        self.sub_self_assign(values.at_mut(i2), a);
420    }
421
422    #[inline(always)]
423    #[allow(deprecated)]
424    default fn butterfly_new<H: Homomorphism<S, Self>>(hom: H, x: &mut Self::Element, y: &mut Self::Element, twiddle: &S::Element) {
425        let mut values = [hom.codomain().clone_el(x), hom.codomain().clone_el(y)];
426        <Self as CooleyTuckeyButterfly<S>>::butterfly(hom.codomain().get_ring(), &hom, &mut values, twiddle, 0, 1);
427        [*x, *y] = values;
428    }
429
430    #[inline(always)]
431    default fn inv_butterfly<V: VectorViewMut<Self::Element>, H: Homomorphism<S, Self>>(&self, hom: H, values: &mut V, twiddle: &<S as RingBase>::Element, i1: usize, i2: usize) {
432        let new_a = self.add_ref(values.at(i1), values.at(i2));
433        let a = std::mem::replace(values.at_mut(i1), new_a);
434        self.sub_self_assign(values.at_mut(i2), a);
435        hom.mul_assign_ref_map(values.at_mut(i2), twiddle);
436    }
437    
438    #[inline(always)]
439    #[allow(deprecated)]
440    default fn inv_butterfly_new<H: Homomorphism<S, Self>>(hom: H, x: &mut Self::Element, y: &mut Self::Element, twiddle: &S::Element) {
441        let mut values = [hom.codomain().clone_el(x), hom.codomain().clone_el(y)];
442        <Self as CooleyTuckeyButterfly<S>>::inv_butterfly(hom.codomain().get_ring(), &hom, &mut values, twiddle, 0, 1);
443        [*x, *y] = values;
444    }
445
446    #[inline(always)]
447    default fn prepare_for_fft(&self, _value: &mut Self::Element) {}
448    
449    #[inline(always)]
450    default fn prepare_for_inv_fft(&self, _value: &mut Self::Element) {}
451}
452
453impl<R_main, R_twiddle, H, A> CooleyTuckeyFFT<R_main, R_twiddle, H, A> 
454    where R_main: ?Sized + RingBase,
455        R_twiddle: ?Sized + RingBase + DivisibilityRing,
456        H: Homomorphism<R_twiddle, R_main>,
457        A: Allocator
458{
459    ///
460    /// Most general way to create a [`CooleyTuckeyFFT`].
461    /// 
462    /// This is currently the same as [`CooleyTuckeyFFT::new_with_pows_with_hom()`], except
463    /// that it additionally accepts an allocator, which is used to copy the input data in
464    /// cases where the input data layout is not optimal for the algorithm.
465    /// 
466    #[stability::unstable(feature = "enable")]
467    pub fn create<F>(hom: H, mut root_of_unity_pow: F, log2_n: usize, allocator: A) -> Self 
468        where F: FnMut(i64) -> R_twiddle::Element
469    {
470        let ring = hom.domain();
471        assert!(ring.is_commutative());
472        assert!(ring.get_ring().is_approximate() || is_prim_root_of_unity_pow2(&ring, &root_of_unity_pow(1), log2_n));
473        assert!(hom.codomain().get_ring().is_approximate() || is_prim_root_of_unity_pow2(&hom.codomain(), &hom.map(root_of_unity_pow(1)), log2_n));
474
475        let root_of_unity_list = Self::create_root_of_unity_list(|i| root_of_unity_pow(-i), log2_n);
476        let inv_root_of_unity_list = Self::create_root_of_unity_list(|i| root_of_unity_pow(i), log2_n);
477        let root_of_unity = root_of_unity_pow(1);
478        
479        let store_twiddle_ring = root_of_unity_list.len();
480        CooleyTuckeyFFT {
481            root_of_unity_list: root_of_unity_list.into_iter().take(store_twiddle_ring).collect(),
482            inv_root_of_unity_list: inv_root_of_unity_list.into_iter().take(store_twiddle_ring).collect(),
483            two_inv: hom.domain().invert(&hom.domain().int_hom().map(2)).unwrap(),
484            n_inv: hom.domain().invert(&hom.domain().int_hom().map(1 << log2_n)).unwrap(),
485            root_of_unity: hom.map(root_of_unity), 
486            hom, 
487            log2_n, 
488            allocator
489        }
490    }
491
492    ///
493    /// Replaces the ring that this object can compute FFTs over, assuming that the current
494    /// twiddle factors can be mapped into the new ring with the given homomorphism.
495    /// 
496    /// In particular, this function does not recompute twiddles, but uses a different
497    /// homomorphism to map the current twiddles into a new ring. Hence, it is extremely
498    /// cheap. 
499    /// 
500    #[stability::unstable(feature = "enable")]
501    pub fn change_ring<R_new: ?Sized + RingBase, H_new: Homomorphism<R_twiddle, R_new>>(self, new_hom: H_new) -> (CooleyTuckeyFFT<R_new, R_twiddle, H_new, A>, H) {
502        let ring = new_hom.codomain();
503        let root_of_unity = if self.log2_n == 0 {
504            new_hom.codomain().one()
505        } else {
506            new_hom.map_ref(&self.inv_root_of_unity_list[self.log2_n - 1][bitreverse(1, self.log2_n - 1)])
507        };
508        assert!(ring.is_commutative());
509        assert!(ring.get_ring().is_approximate() || is_prim_root_of_unity_pow2(&ring, &root_of_unity, self.log2_n));
510
511        return (
512            CooleyTuckeyFFT {
513                root_of_unity_list: self.root_of_unity_list,
514                inv_root_of_unity_list: self.inv_root_of_unity_list,
515                two_inv: self.two_inv,
516                n_inv: self.n_inv,
517                root_of_unity: root_of_unity, 
518                hom: new_hom, 
519                log2_n: self.log2_n, 
520                allocator: self.allocator
521            },
522            self.hom
523        );
524    }
525
526    fn create_root_of_unity_list<F>(mut root_of_unity_pow: F, log2_n: usize) -> Vec<Vec<R_twiddle::Element>>
527        where F: FnMut(i64) -> R_twiddle::Element
528    {
529        let mut twiddles: Vec<Vec<R_twiddle::Element>> = (0..log2_n).map(|_| Vec::new()).collect();
530        for log2_step in 0..log2_n {
531            let butterfly_count = 1 << log2_step;
532            for i in 0..butterfly_count {
533                twiddles[log2_step].push(root_of_unity_pow(bitreverse(i, log2_n - 1) as i64));
534            }
535        }
536        return twiddles;
537    }
538
539    ///
540    /// Returns the ring over which this object can compute FFTs.
541    /// 
542    pub fn ring<'a>(&'a self) -> &'a <H as Homomorphism<R_twiddle, R_main>>::CodomainStore {
543        self.hom.codomain()
544    }
545
546    /// 
547    /// Computes the main butterfly step, either forward or backward (without division by two).
548    /// 
549    /// The forward butterfly is
550    /// ```text
551    ///   (a, b) -> (a + twiddle * b, a - twiddle * b)
552    /// ```
553    /// The backward butterfly is
554    /// ```text
555    ///   (u, v) -> (u + v, twiddle * (u - v))
556    /// ```
557    /// 
558    /// The `#[inline(never)]` here is absolutely important for performance!
559    /// No idea why...
560    /// 
561    #[inline(never)]
562    fn butterfly_step_main<const INV: bool, const IS_PREPARED: bool>(&self, data: &mut [R_main::Element], butterfly_range: Range<usize>, stride_range: Range<usize>, log2_step: usize) {
563        let twiddles = if INV {
564            &self.inv_root_of_unity_list[log2_step]
565        } else {
566            &self.root_of_unity_list[log2_step]
567        };
568        // let start = std::time::Instant::now();
569        let butterfly = |a: &mut _, b: &mut _, twiddle: &_| {
570            if INV {
571                if !IS_PREPARED {
572                    <R_main as CooleyTuckeyButterfly<R_twiddle>>::prepare_for_inv_fft(self.ring().get_ring(), a);
573                    <R_main as CooleyTuckeyButterfly<R_twiddle>>::prepare_for_inv_fft(self.ring().get_ring(), b);
574                }
575                <R_main as CooleyTuckeyButterfly<R_twiddle>>::inv_butterfly_new(&self.hom, a, b, twiddle);
576            } else {
577                if !IS_PREPARED {
578                    <R_main as CooleyTuckeyButterfly<R_twiddle>>::prepare_for_fft(self.ring().get_ring(), a);
579                    <R_main as CooleyTuckeyButterfly<R_twiddle>>::prepare_for_fft(self.ring().get_ring(), b);
580                }
581                <R_main as CooleyTuckeyButterfly<R_twiddle>>::butterfly_new(&self.hom, a, b, twiddle);
582            }
583        };
584        butterfly_loop(self.log2_n, data, butterfly_range, stride_range, log2_step, twiddles, butterfly);
585        // let end = std::time::Instant::now();
586        // BUTTERFLY_TIMES[log2_step].fetch_add((end - start).as_micros() as usize, std::sync::atomic::Ordering::Relaxed);
587    }
588    
589    ///
590    /// The definitions are
591    /// ```text
592    ///   u = a/2 + twiddle * b/2,
593    ///   v = a/2 - twiddle * b/2
594    /// ```
595    /// 
596    #[inline(never)]
597    fn butterfly_ub_from_ab(&self, data: &mut [R_main::Element], butterfly_range: Range<usize>, stride_range: Range<usize>, log2_step: usize) {
598        butterfly_loop(self.log2_n, data, butterfly_range, stride_range, log2_step, &self.root_of_unity_list[log2_step], |a, b, twiddle| {
599            *a = self.hom.mul_ref_snd_map(
600                self.ring().add_ref_fst(a, self.hom.mul_ref_map(b, twiddle)),
601                &self.two_inv
602            );
603        });
604    }
605
606    ///
607    /// The definitions are
608    /// ```text
609    ///   u = a/2 + twiddle * b/2,
610    ///   v = a/2 - twiddle * b/2
611    /// ```
612    /// 
613    #[inline(never)]
614    fn butterfly_uv_from_ub(&self, data: &mut [R_main::Element], butterfly_range: Range<usize>, stride_range: Range<usize>, log2_step: usize) {
615        butterfly_loop(self.log2_n, data, butterfly_range, stride_range, log2_step, &self.root_of_unity_list[log2_step], |a, b, twiddle| {
616            *b = self.ring().sub_ref_fst(a, self.hom.mul_ref_map(b, twiddle));
617        });
618    }
619
620    ///
621    /// The definitions are
622    /// ```text
623    ///   u = a/2 + twiddle * b/2,
624    ///   v = a/2 - twiddle * b/2
625    /// ```
626    /// 
627    #[inline(never)]
628    fn butterfly_ab_from_ub(&self, data: &mut [R_main::Element], butterfly_range: Range<usize>, stride_range: Range<usize>, log2_step: usize) {
629        butterfly_loop(self.log2_n, data, butterfly_range, stride_range, log2_step, &self.root_of_unity_list[log2_step], |a, b, twiddle| {
630            *a = self.ring().add_ref(a, a);
631            self.ring().sub_assign(a, self.hom.mul_ref_map(b, twiddle));
632        });
633    }
634
635    ///
636    /// Returns a reference to the allocator currently used for temporary allocations by this FFT.
637    /// 
638    #[stability::unstable(feature = "enable")]
639    pub fn allocator(&self) -> &A {
640        &self.allocator
641    }
642
643    ///
644    /// Replaces the allocator used for temporary allocations by this FFT.
645    /// 
646    #[stability::unstable(feature = "enable")]
647    pub fn with_allocator<A_new: Allocator>(self, allocator: A_new) -> CooleyTuckeyFFT<R_main, R_twiddle, H, A_new> {
648        CooleyTuckeyFFT {
649            root_of_unity_list: self.root_of_unity_list,
650            inv_root_of_unity_list: self.inv_root_of_unity_list,
651            two_inv: self.two_inv,
652            n_inv: self.n_inv,
653            root_of_unity: self.root_of_unity, 
654            hom: self.hom, 
655            log2_n: self.log2_n, 
656            allocator: allocator
657        }
658    }
659
660    ///
661    /// Returns a reference to the homomorphism that is used to map the stored twiddle
662    /// factors into main ring, over which FFTs are computed.
663    /// 
664    #[stability::unstable(feature = "enable")]
665    pub fn hom(&self) -> &H {
666        &self.hom
667    }
668
669    ///
670    /// Computes the unordered, truncated FFT.
671    /// 
672    /// The truncated FFT is the standard DFT, applied to a list for which only the first
673    /// `nonzero_entries` entries are nonzero, and the (bitreversed) result truncated to
674    /// length `nonzero_entries`.
675    /// 
676    /// Therefore, this function is equivalent to the following pseudocode
677    /// ```text
678    /// data[nonzero_entries..] = 0;
679    /// unordered_fft(data);
680    /// data[nonzero_entries] = unspecified;
681    /// ```
682    /// 
683    /// It can be inverted using [`CooleyTuckey::unordered_truncated_fft_inv()`].
684    /// 
685    #[stability::unstable(feature = "enable")]
686    pub fn unordered_truncated_fft(&self, data: &mut [R_main::Element], nonzero_entries: usize) {
687        assert_eq!(self.len(), data.len());
688        assert!(nonzero_entries > self.len() / 2);
689        assert!(nonzero_entries <= self.len());
690        for i in nonzero_entries..self.len() {
691            debug_assert!(self.ring().get_ring().is_approximate() || self.ring().is_zero(&data[i]));
692        }
693
694        for i in 0..data.len() {
695            <R_main as CooleyTuckeyButterfly<R_twiddle>>::prepare_for_fft(self.ring().get_ring(), &mut data[i]);
696        }
697        for log2_step in 0..self.log2_n {
698            let stride = 1 << (self.log2_n - log2_step - 1);
699            let butterfly_count = nonzero_entries.div_ceil(2 * stride);
700            self.butterfly_step_main::<false, true>(data, 0..butterfly_count, 0..stride, log2_step);
701        }
702    }
703    
704    ///
705    /// Computes the inverse of the unordered, truncated FFT.
706    /// 
707    /// The truncated FFT is the standard DFT, applied to a list for which only the first
708    /// `nonzero_entries` entries are nonzero, and the (bitreversed) result truncated to
709    /// length `nonzero_entries`. Therefore, this function computes a list of `nonzero_entries`
710    /// many values, followed by zeros, whose DFT agrees with the input on the first `nonzero_entries`
711    /// many elements.
712    /// 
713    #[stability::unstable(feature = "enable")]
714    pub fn unordered_truncated_fft_inv(&self, data: &mut [R_main::Element], nonzero_entries: usize) {   
715        assert_eq!(self.len(), data.len());
716        assert!(nonzero_entries > self.len() / 2);
717        assert!(nonzero_entries <= self.len());
718
719        for i in 0..data.len() {
720            <R_main as CooleyTuckeyButterfly<R_twiddle>>::prepare_for_inv_fft(self.ring().get_ring(), &mut data[i]);
721        }
722        for log2_step in (0..self.log2_n).rev() {
723            let stride = 1 << (self.log2_n - log2_step - 1);
724            let current_block = nonzero_entries / (2 * stride);
725            self.butterfly_step_main::<true, true>(data, 0..current_block, 0..stride, log2_step);
726        }
727        if nonzero_entries < (1 << self.log2_n) {
728            for i in nonzero_entries..(1 << self.log2_n) {
729                data[i] = self.ring().zero();
730            }
731            for log2_step in 0..self.log2_n {
732                let stride = 1 << (self.log2_n - log2_step - 1);
733                let current_block = nonzero_entries / (2 * stride);
734                let known_area = nonzero_entries % (2 * stride);
735                if known_area >= stride {
736                    self.butterfly_uv_from_ub(data, current_block..(current_block + 1), (known_area - stride)..stride, log2_step);
737                } else {
738                    self.butterfly_ub_from_ab(data, current_block..(current_block + 1), known_area..stride, log2_step);
739                }
740            }
741            for log2_step in (0..self.log2_n).rev() {
742                let stride = 1 << (self.log2_n - log2_step - 1);
743                let current_block = nonzero_entries / (2 * stride);
744                let known_area = nonzero_entries % (2 * stride);
745                if known_area >= stride {
746                    self.butterfly_step_main::<true, false>(data, current_block..(current_block + 1), 0..stride, log2_step);
747                } else {
748                    self.butterfly_ab_from_ub(data, current_block..(current_block + 1), 0..stride, log2_step);
749                }
750            }
751        }
752        for i in 0..(1 << self.log2_n) {
753            self.hom.mul_assign_ref_map(&mut data[i], &self.n_inv);
754        }
755    }
756    
757    ///
758    /// Permutes the given list of length `n` according to `values[bitreverse(i, log2(n))] = values[i]`.
759    /// This is exactly the permutation that is implicitly applied by [`CooleyTuckeyFFT::unordered_fft()`].
760    /// 
761    pub fn bitreverse_permute_inplace<V, T>(&self, mut values: V) 
762        where V: SwappableVectorViewMut<T>
763    {
764        assert!(values.len() == 1 << self.log2_n);
765        for i in 0..(1 << self.log2_n) {
766            if bitreverse(i, self.log2_n) < i {
767                values.swap(i, bitreverse(i, self.log2_n));
768            }
769        }
770    }
771}
772
773impl<R_main, R_twiddle, H, A> FFTAlgorithm<R_main> for CooleyTuckeyFFT<R_main, R_twiddle, H, A> 
774    where R_main: ?Sized + RingBase,
775        R_twiddle: ?Sized + RingBase + DivisibilityRing,
776        H: Homomorphism<R_twiddle, R_main>,
777        A: Allocator
778{
779    fn len(&self) -> usize {
780        1 << self.log2_n
781    }
782
783    fn root_of_unity<S: Copy + RingStore<Type = R_main>>(&self, ring: S) -> &R_main::Element {
784        assert!(ring.get_ring() == self.ring().get_ring(), "unsupported ring");
785        &self.root_of_unity
786    }
787
788    fn unordered_fft_permutation(&self, i: usize) -> usize {
789        bitreverse(i, self.log2_n)
790    }
791
792    fn unordered_fft_permutation_inv(&self, i: usize) -> usize {
793        bitreverse(i, self.log2_n)
794    }
795
796    fn fft<V, S>(&self, mut values: V, ring: S)
797        where V: SwappableVectorViewMut<<R_main as RingBase>::Element>,
798            S: RingStore<Type = R_main> + Copy 
799    {
800        assert!(ring.get_ring() == self.ring().get_ring(), "unsupported ring");
801        assert_eq!(self.len(), values.len());
802        self.unordered_fft(&mut values, ring);
803        self.bitreverse_permute_inplace(&mut values);
804    }
805
806    fn inv_fft<V, S>(&self, mut values: V, ring: S)
807        where V: SwappableVectorViewMut<<R_main as RingBase>::Element>,
808            S: RingStore<Type = R_main> + Copy 
809    {
810        assert!(ring.get_ring() == self.ring().get_ring(), "unsupported ring");
811        assert_eq!(self.len(), values.len());
812        self.bitreverse_permute_inplace(&mut values);
813        self.unordered_inv_fft(&mut values, ring);
814    }
815
816    fn unordered_fft<V, S>(&self, mut values: V, ring: S)
817        where V: SwappableVectorViewMut<<R_main as RingBase>::Element>,
818            S: RingStore<Type = R_main> + Copy 
819    {
820        assert!(ring.get_ring() == self.ring().get_ring(), "unsupported ring");
821        assert_eq!(self.len(), values.len());
822        if let Some(data) = values.as_slice_mut() {
823            self.unordered_truncated_fft(data, 1 << self.log2_n);
824        } else {
825            let mut data = Vec::with_capacity_in(1 << self.log2_n, &self.allocator);
826            data.extend(values.clone_ring_els(ring).iter());
827            self.unordered_truncated_fft(&mut data, 1 << self.log2_n);
828            for (i, x) in data.into_iter().enumerate() {
829                *values.at_mut(i) = x;
830            }
831        }
832    }
833    
834    fn unordered_inv_fft<V, S>(&self, mut values: V, ring: S)
835        where V: SwappableVectorViewMut<<R_main as RingBase>::Element>,
836            S: RingStore<Type = R_main> + Copy 
837    {
838        assert!(ring.get_ring() == self.ring().get_ring(), "unsupported ring");
839        assert_eq!(self.len(), values.len());
840        if let Some(data) = values.as_slice_mut() {
841            self.unordered_truncated_fft_inv(data, 1 << self.log2_n);
842        } else {
843            let mut data = Vec::with_capacity_in(1 << self.log2_n, &self.allocator);
844            data.extend(values.clone_ring_els(ring).iter());
845            self.unordered_truncated_fft_inv(&mut data, 1 << self.log2_n);
846            for (i, x) in data.into_iter().enumerate() {
847                *values.at_mut(i) = x;
848            }
849        }
850    }
851}
852
853impl<H, A> FFTErrorEstimate for CooleyTuckeyFFT<Complex64Base, Complex64Base, H, A> 
854    where H: Homomorphism<Complex64Base, Complex64Base>,
855        A: Allocator
856{
857    fn expected_absolute_error(&self, input_bound: f64, input_error: f64) -> f64 {
858        // the butterfly performs a multiplication with a root of unity, and an addition
859        let multiply_absolute_error = input_bound * root_of_unity_error() + input_bound * f64::EPSILON;
860        let addition_absolute_error = input_bound * f64::EPSILON;
861        let butterfly_absolute_error = multiply_absolute_error + addition_absolute_error;
862        // the operator inf-norm of the FFT is its length
863        return 2. * self.len() as f64 * butterfly_absolute_error + self.len() as f64 * input_error;
864    }
865}
866
867#[cfg(test)]
868use crate::primitive_int::*;
869#[cfg(test)]
870use crate::rings::zn::zn_static::Fp;
871#[cfg(test)]
872use crate::rings::zn::zn_big;
873#[cfg(test)]
874use crate::rings::zn::zn_static;
875#[cfg(test)]
876use crate::field::*;
877#[cfg(test)]
878use crate::rings::finite::FiniteRingStore;
879
880#[test]
881fn test_bitreverse_fft_inplace_basic() {
882    let ring = Fp::<5>::RING;
883    let z = ring.int_hom().map(2);
884    let fft = CooleyTuckeyFFT::new(ring, ring.div(&1, &z), 2);
885    let mut values = [1, 0, 0, 1];
886    let expected = [2, 4, 0, 3];
887    let mut bitreverse_expected = [0; 4];
888    for i in 0..4 {
889        bitreverse_expected[i] = expected[bitreverse(i, 2)];
890    }
891
892    fft.unordered_fft(&mut values, ring);
893    assert_eq!(values, bitreverse_expected);
894}
895
896#[test]
897fn test_bitreverse_fft_inplace_advanced() {
898    let ring = Fp::<17>::RING;
899    let z = ring.int_hom().map(3);
900    let fft = CooleyTuckeyFFT::new(ring, z, 4);
901    let mut values = [1, 0, 0, 0, 1, 0, 0, 0, 4, 3, 2, 1, 4, 3, 2, 1];
902    let expected = [5, 2, 0, 11, 5, 4, 0, 6, 6, 13, 0, 1, 7, 6, 0, 1];
903    let mut bitreverse_expected = [0; 16];
904    for i in 0..16 {
905        bitreverse_expected[i] = expected[bitreverse(i, 4)];
906    }
907
908    fft.unordered_fft(&mut values, ring);
909    assert_eq!(values, bitreverse_expected);
910}
911
912#[test]
913fn test_unordered_fft_permutation() {
914    let ring = Fp::<17>::RING;
915    let fft = CooleyTuckeyFFT::for_zn(&ring, 4).unwrap();
916    let mut values = [0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
917    let mut expected = [0; 16];
918    for i in 0..16 {
919        let power_of_zeta = ring.pow(*fft.root_of_unity(&ring), 16 - fft.unordered_fft_permutation(i));
920        expected[i] = ring.add(power_of_zeta, ring.pow(power_of_zeta, 4));
921    }
922    fft.unordered_fft(&mut values, ring);
923    assert_eq!(expected, values);
924}
925
926#[test]
927fn test_bitreverse_inv_fft_inplace() {
928    let ring = Fp::<17>::RING;
929    let fft = CooleyTuckeyFFT::for_zn(&ring, 4).unwrap();
930    let values: [u64; 16] = [1, 2, 3, 2, 1, 0, 17 - 1, 17 - 2, 17 - 1, 0, 1, 2, 3, 4, 5, 6];
931    let mut work = values;
932    fft.unordered_fft(&mut work, ring);
933    fft.unordered_inv_fft(&mut work, ring);
934    assert_eq!(&work, &values);
935}
936
937#[test]
938fn test_truncated_fft() {
939    let ring = Fp::<17>::RING;
940    let fft = CooleyTuckeyFFT::new(ring, 2, 3);
941
942    let data = [2, 3, 0, 1, 1, 0, 0, 0];
943    let mut complete_fft = data;
944    fft.unordered_fft(&mut complete_fft, ring);
945    for k in 5..=8 {
946        println!("{}", k);
947        let mut truncated_fft = data;
948        fft.unordered_truncated_fft(&mut truncated_fft, k);
949        assert_eq!(&complete_fft[..k], &truncated_fft[..k]);
950
951        fft.unordered_truncated_fft_inv(&mut truncated_fft, k);
952        assert_eq!(data, truncated_fft);
953    }
954}
955
956#[test]
957fn test_for_zn() {
958    let ring = Fp::<17>::RING;
959    let fft = CooleyTuckeyFFT::for_zn(ring, 4).unwrap();
960    assert!(ring.is_neg_one(&ring.pow(fft.root_of_unity, 8)));
961
962    let ring = Fp::<97>::RING;
963    let fft = CooleyTuckeyFFT::for_zn(ring, 4).unwrap();
964    assert!(ring.is_neg_one(&ring.pow(fft.root_of_unity, 8)));
965}
966
967#[cfg(test)]
968fn run_fft_bench_round<R, S, H>(fft: &CooleyTuckeyFFT<S, R, H>, data: &Vec<S::Element>, copy: &mut Vec<S::Element>)
969    where R: ZnRing, S: ZnRing, H: Homomorphism<R, S>
970{
971    copy.clear();
972    copy.extend(data.iter().map(|x| fft.ring().clone_el(x)));
973    fft.unordered_fft(&mut copy[..], &fft.ring());
974    fft.unordered_inv_fft(&mut copy[..], &fft.ring());
975    assert_el_eq!(fft.ring(), copy[0], data[0]);
976}
977
978#[cfg(test)]
979const BENCH_SIZE_LOG2: usize = 13;
980
981#[bench]
982fn bench_fft_zn_big(bencher: &mut test::Bencher) {
983    let ring = zn_big::Zn::new(StaticRing::<i128>::RING, 1073872897);
984    let fft = CooleyTuckeyFFT::for_zn(&ring, BENCH_SIZE_LOG2).unwrap();
985    let data = (0..(1 << BENCH_SIZE_LOG2)).map(|i| ring.int_hom().map(i)).collect::<Vec<_>>();
986    let mut copy = Vec::with_capacity(1 << BENCH_SIZE_LOG2);
987    bencher.iter(|| {
988        run_fft_bench_round(&fft, &data, &mut copy)
989    });
990}
991
992#[bench]
993fn bench_fft_zn_64(bencher: &mut test::Bencher) {
994    let ring = zn_64::Zn::new(1073872897);
995    let fft = CooleyTuckeyFFT::for_zn(&ring, BENCH_SIZE_LOG2).unwrap();
996    let data = (0..(1 << BENCH_SIZE_LOG2)).map(|i| ring.int_hom().map(i)).collect::<Vec<_>>();
997    let mut copy = Vec::with_capacity(1 << BENCH_SIZE_LOG2);
998    bencher.iter(|| {
999        run_fft_bench_round(&fft, &data, &mut copy)
1000    });
1001}
1002
1003#[bench]
1004fn bench_fft_zn_64_fastmul(bencher: &mut test::Bencher) {
1005    let ring = zn_64::Zn::new(1073872897);
1006    let fastmul_ring = zn_64::ZnFastmul::new(ring).unwrap();
1007    let fft = CooleyTuckeyFFT::for_zn_with_hom(ring.into_can_hom(fastmul_ring).ok().unwrap(), BENCH_SIZE_LOG2).unwrap();
1008    let data = (0..(1 << BENCH_SIZE_LOG2)).map(|i| ring.int_hom().map(i)).collect::<Vec<_>>();
1009    let mut copy = Vec::with_capacity(1 << BENCH_SIZE_LOG2);
1010    bencher.iter(|| {
1011        run_fft_bench_round(&fft, &data, &mut copy)
1012    });
1013}
1014
1015#[test]
1016fn test_approximate_fft() {
1017    let CC = Complex64::RING;
1018    for log2_n in [4, 7, 11, 15] {
1019        let fft = CooleyTuckeyFFT::new_with_pows(CC, |x| CC.root_of_unity(x, 1 << log2_n), log2_n);
1020        let mut array = (0..(1 << log2_n)).map(|i|  CC.root_of_unity(i.try_into().unwrap(), 1 << log2_n)).collect::<Vec<_>>();
1021        fft.fft(&mut array, CC);
1022        let err = fft.expected_absolute_error(1., 0.);
1023        assert!(CC.is_absolute_approx_eq(array[0], CC.zero(), err));
1024        assert!(CC.is_absolute_approx_eq(array[1], CC.from_f64(fft.len() as f64), err));
1025        for i in 2..fft.len() {
1026            assert!(CC.is_absolute_approx_eq(array[i], CC.zero(), err));
1027        }
1028    }
1029}
1030
1031#[test]
1032fn test_size_1_fft() {
1033    let ring = Fp::<17>::RING;
1034    let fft = CooleyTuckeyFFT::for_zn(&ring, 0).unwrap().change_ring(ring.identity()).0;
1035    let values: [u64; 1] = [3];
1036    let mut work = values;
1037    fft.unordered_fft(&mut work, ring);
1038    assert_eq!(&work, &values);
1039    fft.unordered_inv_fft(&mut work, ring);
1040    assert_eq!(&work, &values);
1041    assert_eq!(0, fft.unordered_fft_permutation(0));
1042    assert_eq!(0, fft.unordered_fft_permutation_inv(0));
1043}
1044
1045#[cfg(any(test, feature = "generic_tests"))]
1046pub fn generic_test_cooley_tuckey_butterfly<R: RingStore, S: RingStore, I: Iterator<Item = El<R>>>(ring: R, base: S, edge_case_elements: I, test_twiddle: &El<S>)
1047    where R::Type: CanHomFrom<S::Type>,
1048        S::Type: DivisibilityRing
1049{
1050    let test_inv_twiddle = base.invert(&test_twiddle).unwrap();
1051    let elements = edge_case_elements.collect::<Vec<_>>();
1052    let hom = ring.can_hom(&base).unwrap();
1053
1054    for a in &elements {
1055        for b in &elements {
1056
1057            let [mut x, mut y] = [ring.clone_el(a), ring.clone_el(b)];
1058            <R::Type as CooleyTuckeyButterfly<S::Type>>::butterfly_new(&hom, &mut x, &mut y, &test_twiddle);
1059            assert_el_eq!(ring, ring.add_ref_fst(a, ring.mul_ref_fst(b, hom.map_ref(test_twiddle))), &x);
1060            assert_el_eq!(ring, ring.sub_ref_fst(a, ring.mul_ref_fst(b, hom.map_ref(test_twiddle))), &y);
1061
1062            <R::Type as CooleyTuckeyButterfly<S::Type>>::inv_butterfly_new(&hom, &mut x, &mut y, &test_inv_twiddle);
1063            assert_el_eq!(ring, ring.int_hom().mul_ref_fst_map(a, 2), &x);
1064            assert_el_eq!(ring, ring.int_hom().mul_ref_fst_map(b, 2), &y);
1065
1066            let [mut x, mut y] = [ring.clone_el(a), ring.clone_el(b)];
1067            <R::Type as CooleyTuckeyButterfly<S::Type>>::inv_butterfly_new(&hom, &mut x, &mut y, &test_twiddle);
1068            assert_el_eq!(ring, ring.add_ref(a, b), &x);
1069            assert_el_eq!(ring, ring.mul(ring.sub_ref(a, b), hom.map_ref(test_twiddle)), &y);
1070
1071            <R::Type as CooleyTuckeyButterfly<S::Type>>::butterfly_new(&hom, &mut x, &mut y, &test_inv_twiddle);
1072            assert_el_eq!(ring, ring.int_hom().mul_ref_fst_map(a, 2), &x);
1073            assert_el_eq!(ring, ring.int_hom().mul_ref_fst_map(b, 2), &y);
1074        }
1075    }
1076}
1077
1078#[test]
1079fn test_butterfly() {
1080    generic_test_cooley_tuckey_butterfly(zn_static::F17, zn_static::F17, zn_static::F17.elements(), &get_prim_root_of_unity_pow2(zn_static::F17, 4).unwrap());
1081}