// snarkvm_algorithms/fft/domain.rs
// Copyright (c) 2019-2026 Provable Inc.
// This file is part of the snarkVM library.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at:

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! This module contains an `EvaluationDomain` abstraction for
//! performing various kinds of polynomial arithmetic on top of
//! the scalar field.
//!
//! In pairing-based SNARKs like GM17, we need to calculate
//! a quotient polynomial over a target polynomial with roots
//! at distinct points associated with each constraint of the
//! constraint system. In order to be efficient, we choose these
//! roots to be the powers of a 2^n root of unity in the field.
//! This allows us to perform polynomial operations in O(n)
//! by performing an O(n log n) FFT over such a domain.
use crate::{
    cfg_chunks_mut,
    cfg_into_iter,
    cfg_iter,
    cfg_iter_mut,
    fft::{DomainCoeff, SparsePolynomial},
};
use snarkvm_fields::{FftField, FftParameters, Field, batch_inversion};
#[cfg(not(feature = "serial"))]
use snarkvm_utilities::max_available_threads;
use snarkvm_utilities::{execute_with_max_available_threads, serialize::*};

use rand::Rng;
use std::{borrow::Cow, fmt};

use anyhow::{Result, ensure};

#[cfg(feature = "serial")]
use itertools::Itertools;
#[cfg(not(feature = "serial"))]
use rayon::prelude::*;
49
/// Returns the ceiling of the base-2 logarithm of `x`.
///
/// By convention, `log2(0) == 0`.
///
/// ```
/// use snarkvm_algorithms::fft::domain::log2;
///
/// assert_eq!(log2(16), 4);
/// assert_eq!(log2(17), 5);
/// assert_eq!(log2(1), 0);
/// assert_eq!(log2(0), 0);
/// assert_eq!(log2(usize::MAX), (core::mem::size_of::<usize>() * 8) as u32);
/// assert_eq!(log2(1 << 15), 15);
/// assert_eq!(log2(2usize.pow(18)), 18);
/// ```
pub fn log2(x: usize) -> u32 {
    match x {
        // The logarithm of zero is defined to be zero here.
        0 => 0,
        // An exact power of two: ceil(log2(x)) is the index of the single set bit.
        _ if x.is_power_of_two() => x.trailing_zeros(),
        // Otherwise round up: the number of bits needed to represent `x`.
        _ => usize::BITS - x.leading_zeros(),
    }
}
72
/// The minimum size of a chunk of work that is worth handing to its own thread
/// when parallelizing.
#[allow(unused)]
#[cfg(not(feature = "serial"))]
const MIN_PARALLEL_CHUNK_SIZE: usize = 1 << 7;
77
/// Defines a domain over which finite field (I)FFTs can be performed. Works
/// only for fields that have a large multiplicative subgroup of size that is
/// a power-of-2.
#[derive(Copy, Clone, Hash, Eq, PartialEq, CanonicalSerialize, CanonicalDeserialize)]
pub struct EvaluationDomain<F: FftField> {
    /// The size of the domain (always a power of two; see `Self::new`).
    pub size: u64,
    /// `log_2(self.size)`.
    pub log_size_of_group: u32,
    /// Size of the domain as a field element.
    pub size_as_field_element: F,
    /// Inverse of the size in the field.
    pub size_inv: F,
    /// A generator of the subgroup.
    pub group_gen: F,
    /// Inverse of the generator of the subgroup.
    pub group_gen_inv: F,
    /// Inverse of the multiplicative generator of the finite field.
    pub generator_inv: F,
}
98
99impl<F: FftField> fmt::Debug for EvaluationDomain<F> {
100    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
101        write!(f, "Multiplicative subgroup of size {}", self.size)
102    }
103}
104
impl<F: FftField> EvaluationDomain<F> {
    /// Sample an element that is *not* in the domain.
    pub fn sample_element_outside_domain<R: Rng>(&self, rng: &mut R) -> F {
        // Rejection sampling: `t` lies in the domain iff the vanishing
        // polynomial evaluates to zero at `t`, so resample until it does not.
        let mut t = F::rand(rng);
        while self.evaluate_vanishing_polynomial(t).is_zero() {
            t = F::rand(rng);
        }
        t
    }

    /// Construct a domain that is large enough for evaluations of a polynomial
    /// having `num_coeffs` coefficients.
    ///
    /// Returns `None` if the next power of two overflows `usize`, if the
    /// required subgroup size exceeds the field's two-adicity, or if any of
    /// the required inverses does not exist.
    pub fn new(num_coeffs: usize) -> Option<Self> {
        // Compute the size of our evaluation domain
        let size = num_coeffs.checked_next_power_of_two()? as u64;
        let log_size_of_group = size.trailing_zeros();

        // libfqfft uses > https://github.com/scipr-lab/libfqfft/blob/e0183b2cef7d4c5deb21a6eaf3fe3b586d738fe0/libfqfft/evaluation_domain/domains/basic_radix2_domain.tcc#L33
        if log_size_of_group > F::FftParameters::TWO_ADICITY {
            return None;
        }

        // Compute the generator for the multiplicative subgroup.
        // It should be the 2^(log_size_of_group) root of unity.
        let group_gen = F::get_root_of_unity(size as usize)?;

        // Check that it is indeed the 2^(log_size_of_group) root of unity.
        debug_assert_eq!(group_gen.pow([size]), F::one());

        let size_as_field_element = F::from(size);
        let size_inv = size_as_field_element.inverse()?;

        Some(EvaluationDomain {
            size,
            log_size_of_group,
            size_as_field_element,
            size_inv,
            group_gen,
            group_gen_inv: group_gen.inverse()?,
            generator_inv: F::multiplicative_generator().inverse()?,
        })
    }

    /// Return the size of a domain that is large enough for evaluations of a
    /// polynomial having `num_coeffs` coefficients.
    pub fn compute_size_of_domain(num_coeffs: usize) -> Option<usize> {
        let size = num_coeffs.checked_next_power_of_two()?;
        if size.trailing_zeros() <= F::FftParameters::TWO_ADICITY { Some(size) } else { None }
    }

    /// Return the size of `self`.
    pub fn size(&self) -> usize {
        self.size as usize
    }

    /// Compute an FFT.
    pub fn fft<T: DomainCoeff<F>>(&self, coeffs: &[T]) -> Vec<T> {
        let mut coeffs = coeffs.to_vec();
        self.fft_in_place(&mut coeffs);
        coeffs
    }

    /// Compute an FFT, modifying the vector in place.
    pub fn fft_in_place<T: DomainCoeff<F>>(&self, coeffs: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            // Zero-pad the coefficients up to the domain size before transforming.
            coeffs.resize(self.size(), T::zero());
            self.in_order_fft_in_place(&mut *coeffs);
        });
    }

    /// Compute an IFFT.
    pub fn ifft<T: DomainCoeff<F>>(&self, evals: &[T]) -> Vec<T> {
        let mut evals = evals.to_vec();
        self.ifft_in_place(&mut evals);
        evals
    }

    /// Compute an IFFT, modifying the vector in place.
    #[inline]
    pub fn ifft_in_place<T: DomainCoeff<F>>(&self, evals: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            // Zero-pad the evaluations up to the domain size before transforming.
            evals.resize(self.size(), T::zero());
            self.in_order_ifft_in_place(&mut *evals);
        });
    }

    /// Compute an FFT over a coset of the domain.
    pub fn coset_fft<T: DomainCoeff<F>>(&self, coeffs: &[T]) -> Vec<T> {
        let mut coeffs = coeffs.to_vec();
        self.coset_fft_in_place(&mut coeffs);
        coeffs
    }

    /// Compute an FFT over a coset of the domain, modifying the input vector
    /// in place.
    pub fn coset_fft_in_place<T: DomainCoeff<F>>(&self, coeffs: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            // Shift from the subgroup H to the coset gH by scaling coefficient
            // `i` with `g^i`, then run a plain FFT over H.
            Self::distribute_powers(coeffs, F::multiplicative_generator());
            self.fft_in_place(coeffs);
        });
    }

    /// Compute an IFFT over a coset of the domain.
    pub fn coset_ifft<T: DomainCoeff<F>>(&self, evals: &[T]) -> Vec<T> {
        let mut evals = evals.to_vec();
        self.coset_ifft_in_place(&mut evals);
        evals
    }

    /// Compute an IFFT over a coset of the domain, modifying the input vector
    /// in place.
    pub fn coset_ifft_in_place<T: DomainCoeff<F>>(&self, evals: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            evals.resize(self.size(), T::zero());
            self.in_order_coset_ifft_in_place(&mut *evals);
        });
    }

    /// Multiply the `i`-th element of `coeffs` with `g^i`.
    fn distribute_powers<T: DomainCoeff<F>>(coeffs: &mut [T], g: F) {
        Self::distribute_powers_and_mul_by_const(coeffs, g, F::one());
    }

    /// Multiply the `i`-th element of `coeffs` with `c*g^i`.
    #[cfg(feature = "serial")]
    fn distribute_powers_and_mul_by_const<T: DomainCoeff<F>>(coeffs: &mut [T], g: F, c: F) {
        // invariant: pow = c*g^i at the ith iteration of the loop
        let mut pow = c;
        coeffs.iter_mut().for_each(|coeff| {
            *coeff *= pow;
            pow *= &g
        })
    }

    /// Multiply the `i`-th element of `coeffs` with `c*g^i`.
    #[cfg(not(feature = "serial"))]
    fn distribute_powers_and_mul_by_const<T: DomainCoeff<F>>(coeffs: &mut [T], g: F, c: F) {
        let min_parallel_chunk_size = 1024;
        let num_cpus_available = max_available_threads();
        let num_elem_per_thread = core::cmp::max(coeffs.len() / num_cpus_available, min_parallel_chunk_size);

        // Each chunk starts from its own offset `c * g^(chunk start)` and then
        // walks the powers serially, so chunks are independent of one another.
        cfg_chunks_mut!(coeffs, num_elem_per_thread).enumerate().for_each(|(i, chunk)| {
            let offset = c * g.pow([(i * num_elem_per_thread) as u64]);
            let mut pow = offset;
            chunk.iter_mut().for_each(|coeff| {
                *coeff *= pow;
                pow *= &g
            })
        });
    }

    /// Evaluate all the lagrange polynomials defined by this domain at the
    /// point `tau`.
    pub fn evaluate_all_lagrange_coefficients(&self, tau: F) -> Vec<F> {
        // Evaluate all Lagrange polynomials
        let size = self.size as usize;
        let t_size = tau.pow([self.size]);
        let one = F::one();
        if t_size.is_one() {
            // `tau^size == 1`, so `tau` is itself an element of the domain:
            // exactly one Lagrange polynomial is 1 at `tau`, the rest vanish.
            let mut u = vec![F::zero(); size];
            let mut omega_i = one;
            for x in u.iter_mut().take(size) {
                if omega_i == tau {
                    *x = one;
                    break;
                }
                omega_i *= &self.group_gen;
            }
            u
        } else {
            // `tau` lies outside the domain; use the closed form
            //   L_i(tau) = (tau^n - 1) * g^i / (n * (tau - g^i)),
            // accumulating the numerators in `ls` and the terms `tau - g^i`
            // in `u`, which are inverted with a single batch inversion.
            let mut l = (t_size - one) * self.size_inv;
            let mut r = one;
            let mut u = vec![F::zero(); size];
            let mut ls = vec![F::zero(); size];
            for i in 0..size {
                u[i] = tau - r;
                ls[i] = l;
                l *= &self.group_gen;
                r *= &self.group_gen;
            }

            batch_inversion(u.as_mut_slice());
            // u[i] <- ls[i] / (tau - g^i)
            cfg_iter_mut!(u, 1_000).zip_eq(ls).for_each(|(tau_minus_r, l)| {
                *tau_minus_r = l * *tau_minus_r;
            });
            u
        }
    }

    /// Return the sparse vanishing polynomial.
    pub fn vanishing_polynomial(&self) -> SparsePolynomial<F> {
        // z(X) = X^n - 1, represented by its two nonzero coefficients.
        let coeffs = [(0, -F::one()), (self.size(), F::one())];
        SparsePolynomial::from_coefficients(coeffs)
    }

    /// This evaluates the vanishing polynomial for this domain at tau.
    /// For multiplicative subgroups, this polynomial is `z(X) = X^self.size -
    /// 1`.
    pub fn evaluate_vanishing_polynomial(&self, tau: F) -> F {
        tau.pow([self.size]) - F::one()
    }

    /// Return an iterator over the elements of the domain.
    pub fn elements(&self) -> Elements<F> {
        Elements { cur_elem: F::one(), cur_pow: 0, domain: *self }
    }

    /// The target polynomial is the zero polynomial in our
    /// evaluation domain, so we must perform division over
    /// a coset.
    pub fn divide_by_vanishing_poly_on_coset_in_place(&self, evals: &mut [F]) {
        // On the coset gH, z(g*h) = g^n * h^n - 1 = g^n - 1 for every h in H,
        // so dividing by the vanishing polynomial is one constant scaling.
        let i = self.evaluate_vanishing_polynomial(F::multiplicative_generator()).inverse().unwrap();

        cfg_iter_mut!(evals).for_each(|eval| *eval *= &i);
    }

    /// Given an index in the `other` subdomain, return an index into this
    /// domain `self` This assumes the `other`'s elements are also `self`'s
    /// first elements
    pub fn reindex_by_subdomain(&self, other: &Self, index: usize) -> Result<usize> {
        ensure!(self.size() > other.size(), "other.size() must be smaller than self.size()");

        // Let this subgroup be G, and the subgroup we're re-indexing by be S.
        // Since its a subgroup, the 0th element of S is at index 0 in G, the first
        // element of S is at index |G|/|S|, the second at 2*|G|/|S|, etc.
        // Thus for an index i that corresponds to S, the index in G is i*|G|/|S|
        let period = self.size() / other.size();
        if index < other.size() {
            Ok(index * period)
        } else {
            // Let i now be the index of this element in G \ S
            // Let x be the number of elements in G \ S, for every element in S. Then x =
            // (|G|/|S| - 1). At index i in G \ S, the number of elements in S
            // that appear before the index in G to which i corresponds to, is
            // floor(i / x) + 1. The +1 is because index 0 of G is S_0, so the
            // position is offset by at least one. The floor(i / x) term is
            // because after x elements in G \ S, there is one more element from S
            // that will have appeared in G.
            let i = index - other.size();
            let x = period - 1;
            Ok(i + (i / x) + 1)
        }
    }

    /// Perform O(n) multiplication of two polynomials that are presented by
    /// their evaluations in the domain.
    /// Returns the evaluations of the product over the domain.
    ///
    /// Returns an error if the two evaluation vectors differ in length.
    pub fn mul_polynomials_in_evaluation_domain(&self, self_evals: Vec<F>, other_evals: &[F]) -> Result<Vec<F>> {
        let mut result = self_evals;

        ensure!(result.len() == other_evals.len());
        // Pointwise multiplication in the evaluation domain.
        cfg_iter_mut!(result).zip_eq(other_evals).for_each(|(a, b)| *a *= b);

        Ok(result)
    }
}
361
impl<F: FftField> EvaluationDomain<F> {
    /// Precompute the (forward) FFT roots of unity for this domain.
    pub fn precompute_fft(&self) -> FFTPrecomputation<F> {
        execute_with_max_available_threads(|| FFTPrecomputation {
            roots: self.roots_of_unity(self.group_gen),
            domain: *self,
        })
    }

    /// Precompute the inverse FFT roots of unity for this domain.
    pub fn precompute_ifft(&self) -> IFFTPrecomputation<F> {
        execute_with_max_available_threads(|| IFFTPrecomputation {
            inverse_roots: self.roots_of_unity(self.group_gen_inv),
            domain: *self,
        })
    }

    /// In-order FFT over `x_s`, computing the root precomputation on the fly.
    /// With the `cuda` feature, the transform is first attempted on the GPU
    /// and falls back to the CPU path if the CUDA call fails.
    pub(crate) fn in_order_fft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold and check that the type is Fr
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_fft();
        self.fft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc)
    }

    /// In-order FFT of (a copy of) `x_s` using precomputed roots, zero-padding
    /// the input up to the domain size.
    pub fn in_order_fft_with_pc<T: DomainCoeff<F>>(&self, x_s: &[T], pc: &FFTPrecomputation<F>) -> Vec<T> {
        let mut x_s = x_s.to_vec();
        if self.size() != x_s.len() {
            x_s.extend(core::iter::repeat_n(T::zero(), self.size() - x_s.len()));
        }
        self.fft_helper_in_place_with_pc(&mut x_s, FFTOrder::II, pc);
        x_s
    }

    /// In-order IFFT over `x_s`, including the final division by the domain
    /// size (unlike the `*_helper` methods, which leave that to the caller).
    pub(crate) fn in_order_ifft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_ifft();
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc);
        cfg_iter_mut!(x_s, 1_000).for_each(|val| *val *= self.size_inv);
    }

    /// In-order coset IFFT over `x_s`: IFFT, then undo the coset shift and
    /// divide by the domain size in one pass.
    pub(crate) fn in_order_coset_ifft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_ifft();
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc);
        let coset_shift = self.generator_inv;
        Self::distribute_powers_and_mul_by_const(x_s, coset_shift, self.size_inv);
    }

    /// In-order FFT over `x_s` using the supplied root precomputation.
    #[allow(unused)]
    pub(crate) fn in_order_fft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &FFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        self.fft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp)
    }

    /// FFT whose output is left in bit-reversed (out-of-order) order.
    pub(crate) fn out_order_fft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &FFTPrecomputation<F>,
    ) {
        self.fft_helper_in_place_with_pc(x_s, FFTOrder::IO, pre_comp)
    }

    /// In-order IFFT using the supplied root precomputation, including the
    /// final division by the domain size.
    pub(crate) fn in_order_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp);
        cfg_iter_mut!(x_s, 1_000).for_each(|val| *val *= self.size_inv);
    }

    /// IFFT that accepts bit-reversed (out-of-order) input, including the
    /// final division by the domain size.
    pub(crate) fn out_order_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::OI, pre_comp);
        cfg_iter_mut!(x_s, 1_000).for_each(|val| *val *= self.size_inv);
    }

    /// In-order coset IFFT using the supplied root precomputation: IFFT, then
    /// undo the coset shift and divide by the domain size in one pass.
    #[allow(unused)]
    pub(crate) fn in_order_coset_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            );
            if result.is_ok() {
                return;
            }
        }

        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp);
        let coset_shift = self.generator_inv;
        Self::distribute_powers_and_mul_by_const(x_s, coset_shift, self.size_inv);
    }

    // Core FFT dispatcher: runs the butterfly passes in the requested order,
    // then bit-reverses the output when a fully in-order result (II) is wanted.
    fn fft_helper_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        ord: FFTOrder,
        pre_comp: &FFTPrecomputation<F>,
    ) {
        use FFTOrder::*;
        let pc = pre_comp.precomputation_for_subdomain(self).unwrap();

        let log_len = log2(x_s.len());

        if ord == OI {
            self.oi_helper_with_roots(x_s, &pc.roots);
        } else {
            self.io_helper_with_roots(x_s, &pc.roots);
        }

        if ord == II {
            derange_helper(x_s, log_len);
        }
    }

    // Handles doing an IFFT with handling of being in order and out of order.
    // The results here must all be divided by |x_s|,
    // which is left up to the caller to do.
    fn ifft_helper_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        ord: FFTOrder,
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        use FFTOrder::*;
        let pc = pre_comp.precomputation_for_subdomain(self).unwrap();

        let log_len = log2(x_s.len());

        // For in-order input, bit-reverse first so the OI pass below
        // produces an in-order result.
        if ord == II {
            derange_helper(x_s, log_len);
        }

        if ord == IO {
            self.io_helper_with_roots(x_s, &pc.inverse_roots);
        } else {
            self.oi_helper_with_roots(x_s, &pc.inverse_roots);
        }
    }

    /// Computes the first `self.size / 2` roots of unity for the entire domain.
    /// e.g. for the domain [1, g, g^2, ..., g^{n - 1}], it computes
    /// [1, g, g^2, ..., g^{(n/2) - 1}]
    #[cfg(feature = "serial")]
    pub fn roots_of_unity(&self, root: F) -> Vec<F> {
        compute_powers_serial((self.size as usize) / 2, root)
    }

    /// Computes the first `self.size / 2` roots of unity.
    #[cfg(not(feature = "serial"))]
    pub fn roots_of_unity(&self, root: F) -> Vec<F> {
        // TODO: check if this method can replace parallel compute powers.
        let log_size = log2(self.size as usize);
        // early exit for short inputs
        if log_size <= LOG_ROOTS_OF_UNITY_PARALLEL_SIZE {
            compute_powers_serial((self.size as usize) / 2, root)
        } else {
            let mut temp = root;
            // w, w^2, w^4, w^8, ..., w^(2^(log_size - 1))
            let log_powers: Vec<F> = (0..(log_size - 1))
                .map(|_| {
                    let old_value = temp;
                    temp.square_in_place();
                    old_value
                })
                .collect();

            // allocate the return array and start the recursion
            let mut powers = vec![F::zero(); 1 << (log_size - 1)];
            Self::roots_of_unity_recursive(&mut powers, &log_powers);
            powers
        }
    }

    // Fills `out` with consecutive powers of `log_powers[0]`, by recursively
    // combining the power tables of the two halves of `log_powers` in parallel.
    #[cfg(not(feature = "serial"))]
    fn roots_of_unity_recursive(out: &mut [F], log_powers: &[F]) {
        assert_eq!(out.len(), 1 << log_powers.len());
        // base case: just compute the powers sequentially,
        // g = log_powers[0], out = [1, g, g^2, ...]
        if log_powers.len() <= LOG_ROOTS_OF_UNITY_PARALLEL_SIZE as usize {
            out[0] = F::one();
            for idx in 1..out.len() {
                out[idx] = out[idx - 1] * log_powers[0];
            }
            return;
        }

        // recursive case:
        // 1. split log_powers in half
        let (lr_lo, lr_hi) = log_powers.split_at(log_powers.len().div_ceil(2));
        let mut scr_lo = vec![F::default(); 1 << lr_lo.len()];
        let mut scr_hi = vec![F::default(); 1 << lr_hi.len()];
        // 2. compute each half individually
        rayon::join(
            || Self::roots_of_unity_recursive(&mut scr_lo, lr_lo),
            || Self::roots_of_unity_recursive(&mut scr_hi, lr_hi),
        );
        // 3. recombine halves
        // At this point, out is a blank slice.
        out.par_chunks_mut(scr_lo.len()).zip(&scr_hi).for_each(|(out_chunk, scr_hi)| {
            for (out_elem, scr_lo) in out_chunk.iter_mut().zip(&scr_lo) {
                *out_elem = *scr_hi * scr_lo;
            }
        });
    }

    // Butterfly for in-order input: (lo, hi) <- (lo + hi, (lo - hi) * root).
    #[inline(always)]
    fn butterfly_fn_io<T: DomainCoeff<F>>(((lo, hi), root): ((&mut T, &mut T), &F)) {
        let neg = *lo - *hi;
        *lo += *hi;
        *hi = neg;
        *hi *= *root;
    }

    // Butterfly for out-of-order input: (lo, hi) <- (lo + hi*root, lo - hi*root).
    #[inline(always)]
    fn butterfly_fn_oi<T: DomainCoeff<F>>(((lo, hi), root): ((&mut T, &mut T), &F)) {
        *hi *= *root;
        let neg = *lo - *hi;
        *lo += *hi;
        *hi = neg;
    }

    // Applies butterfly `g` to every (lo, hi, root) triple of one FFT pass.
    // `xi` is split into `num_chunks` clusters of `chunk_size = 2 * gap`
    // positions; `step` is the stride into `roots` for each butterfly.
    #[allow(clippy::too_many_arguments)]
    fn apply_butterfly<T: DomainCoeff<F>, G: Fn(((&mut T, &mut T), &F)) + Copy + Sync + Send>(
        g: G,
        xi: &mut [T],
        roots: &[F],
        step: usize,
        chunk_size: usize,
        num_chunks: usize,
        max_threads: usize,
        gap: usize,
    ) {
        cfg_chunks_mut!(xi, chunk_size).for_each(|cxi| {
            let (lo, hi) = cxi.split_at_mut(gap);
            // If the chunk is sufficiently big that parallelism helps,
            // we parallelize the butterfly operation within the chunk.

            if gap > MIN_GAP_SIZE_FOR_PARALLELISATION && num_chunks < max_threads {
                cfg_iter_mut!(lo, 1000).zip(hi).zip(cfg_iter!(roots, 1000).step_by(step)).for_each(g);
            } else {
                lo.iter_mut().zip(hi).zip(roots.iter().step_by(step)).for_each(g);
            }
        });
    }

    // FFT passes for in-order input (gap halves each pass), leaving the
    // output in bit-reversed order.
    #[allow(clippy::unnecessary_to_owned)]
    fn io_helper_with_roots<T: DomainCoeff<F>>(&self, xi: &mut [T], roots: &[F]) {
        let mut roots = std::borrow::Cow::Borrowed(roots);

        let mut step = 1;
        let mut first = true;

        #[cfg(not(feature = "serial"))]
        let max_threads = snarkvm_utilities::parallel::max_available_threads();
        #[cfg(feature = "serial")]
        let max_threads = 1;

        let mut gap = xi.len() / 2;
        while gap > 0 {
            // each butterfly cluster uses 2*gap positions
            let chunk_size = 2 * gap;
            let num_chunks = xi.len() / chunk_size;

            // Only compact roots to achieve cache locality/compactness if
            // the roots lookup is done a significant amount of times
            // Which also implies a large lookup stride.
            if num_chunks >= MIN_NUM_CHUNKS_FOR_COMPACTION {
                if !first {
                    roots = Cow::Owned(cfg_into_iter!(roots.into_owned()).step_by(step * 2).collect());
                }
                step = 1;
                roots.to_mut().shrink_to_fit();
            } else {
                step = num_chunks;
            }
            first = false;

            Self::apply_butterfly(
                Self::butterfly_fn_io,
                xi,
                &roots[..],
                step,
                chunk_size,
                num_chunks,
                max_threads,
                gap,
            );

            gap /= 2;
        }
    }

    // FFT passes for bit-reversed input (gap doubles each pass), leaving the
    // output in order.
    fn oi_helper_with_roots<T: DomainCoeff<F>>(&self, xi: &mut [T], roots_cache: &[F]) {
        // The `cmp::min` is only necessary for the case where
        // `MIN_NUM_CHUNKS_FOR_COMPACTION = 1`. Else, notice that we compact
        // the roots cache by a stride of at least `MIN_NUM_CHUNKS_FOR_COMPACTION`.

        let compaction_max_size =
            core::cmp::min(roots_cache.len() / 2, roots_cache.len() / MIN_NUM_CHUNKS_FOR_COMPACTION);
        let mut compacted_roots = vec![F::default(); compaction_max_size];

        #[cfg(not(feature = "serial"))]
        let max_threads = snarkvm_utilities::parallel::max_available_threads();
        #[cfg(feature = "serial")]
        let max_threads = 1;

        let mut gap = 1;
        while gap < xi.len() {
            // each butterfly cluster uses 2*gap positions
            let chunk_size = 2 * gap;
            let num_chunks = xi.len() / chunk_size;

            // Only compact roots to achieve cache locality/compactness if
            // the roots lookup is done a significant amount of times
            // Which also implies a large lookup stride.
            let (roots, step) = if num_chunks >= MIN_NUM_CHUNKS_FOR_COMPACTION && gap < xi.len() / 2 {
                cfg_iter_mut!(compacted_roots[..gap], 1_000)
                    .zip(cfg_iter!(roots_cache[..(gap * num_chunks)], 1_000).step_by(num_chunks))
                    .for_each(|(a, b)| *a = *b);
                (&compacted_roots[..gap], 1)
            } else {
                (roots_cache, num_chunks)
            };

            Self::apply_butterfly(Self::butterfly_fn_oi, xi, roots, step, chunk_size, num_chunks, max_threads, gap);

            gap *= 2;
        }
    }
}
778
/// The minimum number of chunks at which root compaction
/// is beneficial.
const MIN_NUM_CHUNKS_FOR_COMPACTION: usize = 1 << 7;

/// The minimum size of a chunk at which parallelization of `butterfly`s is
/// beneficial. This value was chosen empirically.
const MIN_GAP_SIZE_FOR_PARALLELISATION: usize = 1 << 10;

/// The minimum `log2(domain size)` at which `roots_of_unity` switches from
/// the serial to the recursive parallel computation.
#[cfg(not(feature = "serial"))]
const LOG_ROOTS_OF_UNITY_PARALLEL_SIZE: u32 = 7;
790
791#[inline]
792pub(super) fn bitrev(a: u64, log_len: u32) -> u64 {
793    a.reverse_bits() >> (64 - log_len)
794}
795
796pub(crate) fn derange<T>(xi: &mut [T]) {
797    derange_helper(xi, log2(xi.len()))
798}
799
800fn derange_helper<T>(xi: &mut [T], log_len: u32) {
801    for idx in 1..(xi.len() as u64 - 1) {
802        let ridx = bitrev(idx, log_len);
803        if idx < ridx {
804            xi.swap(idx as usize, ridx as usize);
805        }
806    }
807}
808
/// The ordering (in-order vs. bit-reversed) required of the input and output
/// of an (I)FFT pass.
#[derive(PartialEq, Eq, Debug)]
enum FFTOrder {
    /// Both the input and the output of the FFT must be in-order.
    II,
    /// The input of the FFT must be in-order, but the output does not have to
    /// be.
    IO,
    /// The input of the FFT can be out of order, but the output must be
    /// in-order.
    OI,
}
820
821pub(crate) fn compute_powers_serial<F: Field>(size: usize, root: F) -> Vec<F> {
822    compute_powers_and_mul_by_const_serial(size, root, F::one())
823}
824
825pub(crate) fn compute_powers_and_mul_by_const_serial<F: Field>(size: usize, root: F, c: F) -> Vec<F> {
826    let mut value = c;
827    (0..size)
828        .map(|_| {
829            let old_value = value;
830            value *= root;
831            old_value
832        })
833        .collect()
834}
835
836#[allow(unused)]
837#[cfg(not(feature = "serial"))]
838pub(crate) fn compute_powers<F: Field>(size: usize, g: F) -> Vec<F> {
839    if size < MIN_PARALLEL_CHUNK_SIZE {
840        return compute_powers_serial(size, g);
841    }
842    // compute the number of threads we will be using.
843    let num_cpus_available = max_available_threads();
844    let num_elem_per_thread = core::cmp::max(size / num_cpus_available, MIN_PARALLEL_CHUNK_SIZE);
845    let num_cpus_used = size / num_elem_per_thread;
846
847    // Split up the powers to compute across each thread evenly.
848    let res: Vec<F> = (0..num_cpus_used)
849        .into_par_iter()
850        .flat_map(|i| {
851            let offset = g.pow([(i * num_elem_per_thread) as u64]);
852            // Compute the size that this chunks' output should be
853            // (num_elem_per_thread, unless there are less than num_elem_per_thread elements
854            // remaining)
855            let num_elements_to_compute = core::cmp::min(size - i * num_elem_per_thread, num_elem_per_thread);
856            compute_powers_and_mul_by_const_serial(num_elements_to_compute, g, offset)
857        })
858        .collect();
859    res
860}
861
/// An iterator over the elements of the domain.
#[derive(Clone)]
pub struct Elements<F: FftField> {
    // The element to yield on the next call to `next()`.
    cur_elem: F,
    // Number of elements yielded so far; iteration stops once this reaches `domain.size`.
    cur_pow: u64,
    // The domain whose elements are being enumerated.
    domain: EvaluationDomain<F>,
}
869
870impl<F: FftField> Iterator for Elements<F> {
871    type Item = F;
872
873    fn next(&mut self) -> Option<F> {
874        if self.cur_pow == self.domain.size {
875            None
876        } else {
877            let cur_elem = self.cur_elem;
878            self.cur_elem *= &self.domain.group_gen;
879            self.cur_pow += 1;
880            Some(cur_elem)
881        }
882    }
883}
884
/// Precomputed roots of unity used to speed up FFTs over `domain`.
/// (The previous doc comment, "An iterator over the elements of the
/// domain.", was copy-pasted from `Elements` and did not describe this type.)
#[derive(Clone, Eq, PartialEq, Debug, CanonicalDeserialize, CanonicalSerialize)]
pub struct FFTPrecomputation<F: FftField> {
    // The precomputed roots of unity for `domain`.
    roots: Vec<F>,
    // The domain these roots were computed for.
    domain: EvaluationDomain<F>,
}
891
892impl<F: FftField> FFTPrecomputation<F> {
893    pub fn to_ifft_precomputation(&self) -> IFFTPrecomputation<F> {
894        let mut inverse_roots = self.roots.clone();
895        snarkvm_fields::batch_inversion(&mut inverse_roots);
896        IFFTPrecomputation { inverse_roots, domain: self.domain }
897    }
898
899    pub fn precomputation_for_subdomain<'a>(&'a self, domain: &EvaluationDomain<F>) -> Option<Cow<'a, Self>> {
900        if domain.size() == 1 {
901            return Some(Cow::Owned(Self { roots: vec![], domain: *domain }));
902        }
903        if &self.domain == domain {
904            Some(Cow::Borrowed(self))
905        } else if domain.size() < self.domain.size() {
906            let size_ratio = self.domain.size() / domain.size();
907            let roots = self.roots.iter().step_by(size_ratio).copied().collect();
908            Some(Cow::Owned(Self { roots, domain: *domain }))
909        } else {
910            None
911        }
912    }
913}
914
/// Precomputed inverse roots of unity used to speed up IFFTs over `domain`.
/// (The previous doc comment, "An iterator over the elements of the
/// domain.", was copy-pasted from `Elements` and did not describe this type.)
#[derive(Clone, Eq, PartialEq, Debug, CanonicalSerialize, CanonicalDeserialize)]
pub struct IFFTPrecomputation<F: FftField> {
    // The precomputed inverse roots of unity for `domain`.
    inverse_roots: Vec<F>,
    // The domain these inverse roots were computed for.
    domain: EvaluationDomain<F>,
}
921
922impl<F: FftField> IFFTPrecomputation<F> {
923    pub fn precomputation_for_subdomain<'a>(&'a self, domain: &EvaluationDomain<F>) -> Option<Cow<'a, Self>> {
924        if domain.size() == 1 {
925            return Some(Cow::Owned(Self { inverse_roots: vec![], domain: *domain }));
926        }
927        if &self.domain == domain {
928            Some(Cow::Borrowed(self))
929        } else if domain.size() < self.domain.size() {
930            let size_ratio = self.domain.size() / domain.size();
931            let inverse_roots = self.inverse_roots.iter().step_by(size_ratio).copied().collect();
932            Some(Cow::Owned(Self { inverse_roots, domain: *domain }))
933        } else {
934            None
935        }
936    }
937}
938
#[cfg(test)]
mod tests {
    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
    use crate::fft::domain::FFTOrder;
    use crate::fft::{DensePolynomial, EvaluationDomain};
    use rand::Rng;
    use snarkvm_curves::bls12_377::Fr;
    use snarkvm_fields::{FftField, Field, One, Zero};
    use snarkvm_utilities::{TestRng, Uniform};

    /// Tests that the vanishing polynomial evaluates consistently with
    /// `evaluate_vanishing_polynomial` at random points.
    #[test]
    fn vanishing_polynomial_evaluation() {
        let rng = &mut TestRng::default();
        for coeffs in 0..10 {
            let domain = EvaluationDomain::<Fr>::new(coeffs).unwrap();
            let z = domain.vanishing_polynomial();
            for _ in 0..100 {
                let point = rng.r#gen();
                assert_eq!(z.evaluate(point), domain.evaluate_vanishing_polynomial(point))
            }
        }
    }

    /// Tests that the vanishing polynomial is zero on every domain element.
    #[test]
    fn vanishing_polynomial_vanishes_on_domain() {
        for coeffs in 0..1000 {
            let domain = EvaluationDomain::<Fr>::new(coeffs).unwrap();
            let z = domain.vanishing_polynomial();
            for point in domain.elements() {
                assert!(z.evaluate(point).is_zero())
            }
        }
    }

    /// Tests that `elements()` yields exactly `domain.size()` elements.
    #[test]
    fn size_of_elements() {
        for coeffs in 1..10 {
            let size = 1 << coeffs;
            let domain = EvaluationDomain::<Fr>::new(size).unwrap();
            let domain_size = domain.size();
            assert_eq!(domain_size, domain.elements().count());
        }
    }

    /// Tests that `elements()` yields the successive powers of the group generator.
    #[test]
    fn elements_contents() {
        for coeffs in 1..10 {
            let size = 1 << coeffs;
            let domain = EvaluationDomain::<Fr>::new(size).unwrap();
            for (i, element) in domain.elements().enumerate() {
                assert_eq!(element, domain.group_gen.pow([i as u64]));
            }
        }
    }

    /// Test that lagrange interpolation for a random polynomial at a random
    /// point works.
    #[test]
    fn non_systematic_lagrange_coefficients_test() {
        let mut rng = TestRng::default();
        for domain_dimension in 1..10 {
            let domain_size = 1 << domain_dimension;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            // Get random point & lagrange coefficients
            let random_point = Fr::rand(&mut rng);
            let lagrange_coefficients = domain.evaluate_all_lagrange_coefficients(random_point);

            // Sample the random polynomial, evaluate it over the domain and the random
            // point.
            let random_polynomial = DensePolynomial::<Fr>::rand(domain_size - 1, &mut rng);
            let polynomial_evaluations = domain.fft(random_polynomial.coeffs());
            let actual_evaluations = random_polynomial.evaluate(random_point);

            // Do lagrange interpolation, and compare against the actual evaluation
            let mut interpolated_evaluation = Fr::zero();
            for i in 0..domain_size {
                interpolated_evaluation += lagrange_coefficients[i] * polynomial_evaluations[i];
            }
            assert_eq!(actual_evaluations, interpolated_evaluation);
        }
    }

    /// Test that lagrange coefficients for a point in the domain is correct.
    #[test]
    fn systematic_lagrange_coefficients_test() {
        // This runs in time O(N^2) in the domain size, so keep the domain dimension
        // low. We generate lagrange coefficients for each element in the
        // domain.
        for domain_dimension in 1..5 {
            let domain_size = 1 << domain_dimension;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let all_domain_elements: Vec<Fr> = domain.elements().collect();
            for (i, domain_element) in all_domain_elements.iter().enumerate().take(domain_size) {
                let lagrange_coefficients = domain.evaluate_all_lagrange_coefficients(*domain_element);
                for (j, lagrange_coefficient) in lagrange_coefficients.iter().enumerate().take(domain_size) {
                    // Lagrange coefficient for the evaluation point, which should be 1
                    if i == j {
                        assert_eq!(*lagrange_coefficient, Fr::one());
                    } else {
                        assert_eq!(*lagrange_coefficient, Fr::zero());
                    }
                }
            }
        }
    }

    /// Tests that the roots of unity result is the same as domain.elements().
    #[test]
    fn test_roots_of_unity() {
        let max_degree = 10;
        for log_domain_size in 0..max_degree {
            let domain_size = 1 << log_domain_size;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let actual_roots = domain.roots_of_unity(domain.group_gen);
            for &value in &actual_roots {
                assert!(domain.evaluate_vanishing_polynomial(value).is_zero());
            }
            let expected_roots_elements = domain.elements();
            for (expected, &actual) in expected_roots_elements.zip(&actual_roots) {
                assert_eq!(expected, actual);
            }
            assert_eq!(actual_roots.len(), domain_size / 2);
        }
    }

    /// Tests that the FFTs output the correct result.
    #[test]
    fn test_fft_correctness() {
        // This assumes a correct polynomial evaluation at point procedure.
        // It tests consistency of FFT/IFFT, and coset_fft/coset_ifft,
        // along with testing that each individual evaluation is correct.

        let mut rng = TestRng::default();

        // Runs in time O(degree^2)
        let log_degree = 5;
        let degree = 1 << log_degree;
        let random_polynomial = DensePolynomial::<Fr>::rand(degree - 1, &mut rng);

        for log_domain_size in log_degree..(log_degree + 2) {
            let domain_size = 1 << log_domain_size;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let polynomial_evaluations = domain.fft(&random_polynomial.coeffs);
            let polynomial_coset_evaluations = domain.coset_fft(&random_polynomial.coeffs);
            for (i, x) in domain.elements().enumerate() {
                let coset_x = Fr::multiplicative_generator() * x;

                assert_eq!(polynomial_evaluations[i], random_polynomial.evaluate(x));
                assert_eq!(polynomial_coset_evaluations[i], random_polynomial.evaluate(coset_x));
            }

            // Renamed from the misspelled `randon_polynomial_from_subgroup`.
            let random_polynomial_from_subgroup =
                DensePolynomial::from_coefficients_vec(domain.ifft(&polynomial_evaluations));
            let random_polynomial_from_coset =
                DensePolynomial::from_coefficients_vec(domain.coset_ifft(&polynomial_coset_evaluations));

            assert_eq!(
                random_polynomial, random_polynomial_from_subgroup,
                "degree = {degree}, domain size = {domain_size}"
            );
            assert_eq!(
                random_polynomial, random_polynomial_from_coset,
                "degree = {degree}, domain size = {domain_size}"
            );
        }
    }

    /// Tests that FFT precomputation is correctly subdomained
    #[test]
    fn test_fft_precomputation() {
        for i in 1..10 {
            let big_domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = big_domain.precompute_fft();
            for j in 1..i {
                let small_domain = EvaluationDomain::<Fr>::new(j).unwrap();
                let small_pc = small_domain.precompute_fft();
                assert_eq!(pc.precomputation_for_subdomain(&small_domain).unwrap().as_ref(), &small_pc);
            }
        }
    }

    /// Tests that IFFT precomputation is correctly subdomained
    #[test]
    fn test_ifft_precomputation() {
        for i in 1..10 {
            let big_domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = big_domain.precompute_ifft();
            for j in 1..i {
                let small_domain = EvaluationDomain::<Fr>::new(j).unwrap();
                let small_pc = small_domain.precompute_ifft();
                assert_eq!(pc.precomputation_for_subdomain(&small_domain).unwrap().as_ref(), &small_pc);
            }
        }
    }

    /// Tests that IFFT precomputation can be correctly computed from
    /// FFT precomputation
    #[test]
    fn test_ifft_precomputation_from_fft() {
        for i in 1..10 {
            let domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = domain.precompute_ifft();
            let fft_pc = domain.precompute_fft();
            assert_eq!(pc, fft_pc.to_ifft_precomputation())
        }
    }

    /// Tests that the FFTs output the correct result.
    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
    #[test]
    fn test_fft_correctness_cuda() {
        let mut rng = TestRng::default();
        for log_domain in 2..20 {
            println!("Testing domain size {log_domain}");
            let domain_size = 1 << log_domain;
            let random_polynomial = DensePolynomial::<Fr>::rand(domain_size - 1, &mut rng);
            let mut polynomial_evaluations = random_polynomial.coeffs.clone();
            let mut polynomial_evaluations_cuda = random_polynomial.coeffs.clone();

            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let pc = domain.precompute_fft();
            domain.fft_helper_in_place_with_pc(&mut polynomial_evaluations, FFTOrder::II, &pc);

            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            )
            .is_err()
            {
                println!("cuda error!");
            }

            assert_eq!(polynomial_evaluations, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // iNTT
            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            )
            .is_err()
            {
                println!("cuda error!");
            }
            assert_eq!(random_polynomial.coeffs, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // Coset NTT
            polynomial_evaluations = random_polynomial.coeffs.clone();
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let pc = domain.precompute_fft();
            EvaluationDomain::<Fr>::distribute_powers(&mut polynomial_evaluations, Fr::multiplicative_generator());
            domain.fft_helper_in_place_with_pc(&mut polynomial_evaluations, FFTOrder::II, &pc);

            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Coset,
            )
            .is_err()
            {
                println!("cuda error!");
            }

            assert_eq!(polynomial_evaluations, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // Coset iNTT
            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            )
            .is_err()
            {
                println!("cuda error!");
            }
            assert_eq!(random_polynomial.coeffs, polynomial_evaluations_cuda, "domain size = {domain_size}");
        }
    }
}