snarkvm_algorithms/fft/
domain.rs

// Copyright (c) 2019-2025 Provable Inc.
// This file is part of the snarkVM library.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at:

// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! This module contains an `EvaluationDomain` abstraction for
//! performing various kinds of polynomial arithmetic on top of
//! the scalar field.
//!
//! In pairing-based SNARKs like GM17, we need to calculate
//! a quotient polynomial over a target polynomial with roots
//! at distinct points associated with each constraint of the
//! constraint system. In order to be efficient, we choose these
//! roots to be the powers of a 2^n root of unity in the field.
//! This allows us to perform polynomial operations in O(n),
//! after an O(n log n) FFT into evaluations over such a domain.

use crate::{
    cfg_chunks_mut,
    cfg_into_iter,
    cfg_iter,
    cfg_iter_mut,
    fft::{DomainCoeff, SparsePolynomial},
};
use snarkvm_fields::{FftField, FftParameters, Field, batch_inversion};
#[cfg(not(feature = "serial"))]
use snarkvm_utilities::max_available_threads;
use snarkvm_utilities::{execute_with_max_available_threads, serialize::*};

use rand::Rng;
use std::{borrow::Cow, fmt};

use anyhow::{Result, ensure};

#[cfg(not(feature = "serial"))]
use rayon::prelude::*;

#[cfg(feature = "serial")]
use itertools::Itertools;

/// Returns the ceiling of the base-2 logarithm of `x`.
///
/// ```
/// use snarkvm_algorithms::fft::domain::log2;
///
/// assert_eq!(log2(16), 4);
/// assert_eq!(log2(17), 5);
/// assert_eq!(log2(1), 0);
/// assert_eq!(log2(0), 0);
/// assert_eq!(log2(usize::MAX), (core::mem::size_of::<usize>() * 8) as u32);
/// assert_eq!(log2(1 << 15), 15);
/// assert_eq!(log2(2usize.pow(18)), 18);
/// ```
pub fn log2(x: usize) -> u32 {
    if x == 0 {
        0
    } else if x.is_power_of_two() {
        1usize.leading_zeros() - x.leading_zeros()
    } else {
        0usize.leading_zeros() - x.leading_zeros()
    }
}

// The minimum size of a parallelized chunk.
#[allow(unused)]
#[cfg(not(feature = "serial"))]
const MIN_PARALLEL_CHUNK_SIZE: usize = 1 << 7;

/// Defines a domain over which finite field (I)FFTs can be performed. Works
/// only for fields that have a large multiplicative subgroup whose size is a
/// power of 2.
#[derive(Copy, Clone, Hash, Eq, PartialEq, CanonicalSerialize, CanonicalDeserialize)]
pub struct EvaluationDomain<F: FftField> {
    /// The size of the domain.
    pub size: u64,
    /// `log_2(self.size)`.
    pub log_size_of_group: u32,
    /// Size of the domain as a field element.
    pub size_as_field_element: F,
    /// Inverse of the size in the field.
    pub size_inv: F,
    /// A generator of the subgroup.
    pub group_gen: F,
    /// Inverse of the generator of the subgroup.
    pub group_gen_inv: F,
    /// Inverse of the multiplicative generator of the finite field.
    pub generator_inv: F,
}

impl<F: FftField> fmt::Debug for EvaluationDomain<F> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "Multiplicative subgroup of size {}", self.size)
    }
}

impl<F: FftField> EvaluationDomain<F> {
    /// Sample an element that is *not* in the domain.
    pub fn sample_element_outside_domain<R: Rng>(&self, rng: &mut R) -> F {
        let mut t = F::rand(rng);
        while self.evaluate_vanishing_polynomial(t).is_zero() {
            t = F::rand(rng);
        }
        t
    }

    /// Construct a domain that is large enough for evaluations of a polynomial
    /// having `num_coeffs` coefficients.
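    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    ///
    /// // The domain size is rounded up to the next power of two.
    /// let domain = EvaluationDomain::<Fr>::new(5).unwrap();
    /// assert_eq!(domain.size(), 8);
    /// ```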
    pub fn new(num_coeffs: usize) -> Option<Self> {
        // Compute the size of our evaluation domain.
        let size = num_coeffs.checked_next_power_of_two()? as u64;
        let log_size_of_group = size.trailing_zeros();

        // libfqfft uses a strict `>` comparison here; see
        // https://github.com/scipr-lab/libfqfft/blob/e0183b2cef7d4c5deb21a6eaf3fe3b586d738fe0/libfqfft/evaluation_domain/domains/basic_radix2_domain.tcc#L33
        if log_size_of_group > F::FftParameters::TWO_ADICITY {
            return None;
        }

        // Compute the generator for the multiplicative subgroup.
        // It should be a 2^(log_size_of_group)-th root of unity.
        let group_gen = F::get_root_of_unity(size as usize)?;

        // Check that it is indeed a 2^(log_size_of_group)-th root of unity.
        debug_assert_eq!(group_gen.pow([size]), F::one());

        let size_as_field_element = F::from(size);
        let size_inv = size_as_field_element.inverse()?;

        Some(EvaluationDomain {
            size,
            log_size_of_group,
            size_as_field_element,
            size_inv,
            group_gen,
            group_gen_inv: group_gen.inverse()?,
            generator_inv: F::multiplicative_generator().inverse()?,
        })
    }

    /// Return the size of a domain that is large enough for evaluations of a
    /// polynomial having `num_coeffs` coefficients.
    pub fn compute_size_of_domain(num_coeffs: usize) -> Option<usize> {
        let size = num_coeffs.checked_next_power_of_two()?;
        if size.trailing_zeros() <= F::FftParameters::TWO_ADICITY { Some(size) } else { None }
    }

    /// Return the size of `self`.
    pub fn size(&self) -> usize {
        self.size as usize
    }

    /// Compute an FFT.
    pub fn fft<T: DomainCoeff<F>>(&self, coeffs: &[T]) -> Vec<T> {
        let mut coeffs = coeffs.to_vec();
        self.fft_in_place(&mut coeffs);
        coeffs
    }

    /// Compute an FFT, modifying the vector in place.
    pub fn fft_in_place<T: DomainCoeff<F>>(&self, coeffs: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            coeffs.resize(self.size(), T::zero());
            self.in_order_fft_in_place(&mut *coeffs);
        });
    }

    /// Compute an IFFT.
    pub fn ifft<T: DomainCoeff<F>>(&self, evals: &[T]) -> Vec<T> {
        let mut evals = evals.to_vec();
        self.ifft_in_place(&mut evals);
        evals
    }

    /// Compute an IFFT, modifying the vector in place.
    #[inline]
    pub fn ifft_in_place<T: DomainCoeff<F>>(&self, evals: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            evals.resize(self.size(), T::zero());
            self.in_order_ifft_in_place(&mut *evals);
        });
    }

    /// Compute an FFT over a coset of the domain.
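    ///
    /// # Example
    ///
    /// A minimal round-trip sketch (assuming the `bls12_377::Fr` scalar field,
    /// as used in the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_fields::One;
    ///
    /// let domain = EvaluationDomain::<Fr>::new(4).unwrap();
    /// // The polynomial 1 + 2x, evaluated over the coset g * H.
    /// let coeffs = vec![Fr::one(), Fr::one() + Fr::one()];
    /// let evals = domain.coset_fft(&coeffs);
    /// // `coset_ifft` recovers the coefficients, padded to the domain size.
    /// let recovered = domain.coset_ifft(&evals);
    /// assert_eq!(&recovered[..2], &coeffs[..]);
    /// ```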
    pub fn coset_fft<T: DomainCoeff<F>>(&self, coeffs: &[T]) -> Vec<T> {
        let mut coeffs = coeffs.to_vec();
        self.coset_fft_in_place(&mut coeffs);
        coeffs
    }

    /// Compute an FFT over a coset of the domain, modifying the input vector
    /// in place.
    pub fn coset_fft_in_place<T: DomainCoeff<F>>(&self, coeffs: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            Self::distribute_powers(coeffs, F::multiplicative_generator());
            self.fft_in_place(coeffs);
        });
    }

    /// Compute an IFFT over a coset of the domain.
    pub fn coset_ifft<T: DomainCoeff<F>>(&self, evals: &[T]) -> Vec<T> {
        let mut evals = evals.to_vec();
        self.coset_ifft_in_place(&mut evals);
        evals
    }

    /// Compute an IFFT over a coset of the domain, modifying the input vector
    /// in place.
    pub fn coset_ifft_in_place<T: DomainCoeff<F>>(&self, evals: &mut Vec<T>) {
        execute_with_max_available_threads(|| {
            evals.resize(self.size(), T::zero());
            self.in_order_coset_ifft_in_place(&mut *evals);
        });
    }

    /// Multiply the `i`-th element of `coeffs` by `g^i`.
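    /// For example, `distribute_powers(&mut [a, b, c], g)` yields `[a, b * g, c * g^2]`.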
    fn distribute_powers<T: DomainCoeff<F>>(coeffs: &mut [T], g: F) {
        Self::distribute_powers_and_mul_by_const(coeffs, g, F::one());
    }

    /// Multiply the `i`-th element of `coeffs` by `c*g^i`.
    #[cfg(feature = "serial")]
    fn distribute_powers_and_mul_by_const<T: DomainCoeff<F>>(coeffs: &mut [T], g: F, c: F) {
        // Invariant: `pow = c*g^i` at the `i`-th iteration of the loop.
        let mut pow = c;
        coeffs.iter_mut().for_each(|coeff| {
            *coeff *= pow;
            pow *= &g
        })
    }

    /// Multiply the `i`-th element of `coeffs` by `c*g^i`.
    #[cfg(not(feature = "serial"))]
    fn distribute_powers_and_mul_by_const<T: DomainCoeff<F>>(coeffs: &mut [T], g: F, c: F) {
        let min_parallel_chunk_size = 1024;
        let num_cpus_available = max_available_threads();
        let num_elem_per_thread = core::cmp::max(coeffs.len() / num_cpus_available, min_parallel_chunk_size);

        cfg_chunks_mut!(coeffs, num_elem_per_thread).enumerate().for_each(|(i, chunk)| {
            // Each chunk starts at the offset `c * g^(i * num_elem_per_thread)`.
            let offset = c * g.pow([(i * num_elem_per_thread) as u64]);
            let mut pow = offset;
            chunk.iter_mut().for_each(|coeff| {
                *coeff *= pow;
                pow *= &g
            })
        });
    }

    /// Evaluate all the Lagrange polynomials defined by this domain at the
    /// point `tau`.
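    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_fields::{One, Zero};
    ///
    /// let domain = EvaluationDomain::<Fr>::new(4).unwrap();
    /// // At a domain element, the Lagrange coefficients form an indicator vector.
    /// let coeffs = domain.evaluate_all_lagrange_coefficients(domain.group_gen);
    /// assert_eq!(coeffs, vec![Fr::zero(), Fr::one(), Fr::zero(), Fr::zero()]);
    /// ```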
    pub fn evaluate_all_lagrange_coefficients(&self, tau: F) -> Vec<F> {
        // Evaluate all Lagrange polynomials
        let size = self.size as usize;
        let t_size = tau.pow([self.size]);
        let one = F::one();
        if t_size.is_one() {
            let mut u = vec![F::zero(); size];
            let mut omega_i = one;
            for x in u.iter_mut().take(size) {
                if omega_i == tau {
                    *x = one;
                    break;
                }
                omega_i *= &self.group_gen;
            }
            u
        } else {
            let mut l = (t_size - one) * self.size_inv;
            let mut r = one;
            let mut u = vec![F::zero(); size];
            let mut ls = vec![F::zero(); size];
            for i in 0..size {
                u[i] = tau - r;
                ls[i] = l;
                l *= &self.group_gen;
                r *= &self.group_gen;
            }

            batch_inversion(u.as_mut_slice());
            cfg_iter_mut!(u).zip_eq(ls).for_each(|(tau_minus_r, l)| {
                *tau_minus_r = l * *tau_minus_r;
            });
            u
        }
    }

    /// Return the sparse vanishing polynomial.
    pub fn vanishing_polynomial(&self) -> SparsePolynomial<F> {
        let coeffs = [(0, -F::one()), (self.size(), F::one())];
        SparsePolynomial::from_coefficients(coeffs)
    }

    /// This evaluates the vanishing polynomial for this domain at `tau`.
    /// For multiplicative subgroups, this polynomial is
    /// `z(X) = X^self.size - 1`.
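    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_fields::Zero;
    ///
    /// let domain = EvaluationDomain::<Fr>::new(4).unwrap();
    /// // z(X) = X^4 - 1 vanishes on every element of the domain.
    /// for x in domain.elements() {
    ///     assert!(domain.evaluate_vanishing_polynomial(x).is_zero());
    /// }
    /// ```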
    pub fn evaluate_vanishing_polynomial(&self, tau: F) -> F {
        tau.pow([self.size]) - F::one()
    }

    /// Return an iterator over the elements of the domain.
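    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_fields::One;
    ///
    /// let domain = EvaluationDomain::<Fr>::new(4).unwrap();
    /// let elements: Vec<Fr> = domain.elements().collect();
    /// assert_eq!(elements.len(), 4);
    /// // The elements are the powers [1, g, g^2, g^3] of the group generator.
    /// assert_eq!(elements[0], Fr::one());
    /// assert_eq!(elements[1], domain.group_gen);
    /// ```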
    pub fn elements(&self) -> Elements<F> {
        Elements { cur_elem: F::one(), cur_pow: 0, domain: *self }
    }

    /// The target polynomial is the zero polynomial in our
    /// evaluation domain, so we must perform division over
    /// a coset.
    pub fn divide_by_vanishing_poly_on_coset_in_place(&self, evals: &mut [F]) {
        let i = self.evaluate_vanishing_polynomial(F::multiplicative_generator()).inverse().unwrap();

        cfg_iter_mut!(evals).for_each(|eval| *eval *= &i);
    }

    /// Given an index into the `other` subdomain, return the corresponding
    /// index into this domain `self`. This assumes that `other`'s elements
    /// are also `self`'s first elements.
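    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    ///
    /// let big = EvaluationDomain::<Fr>::new(8).unwrap();
    /// let small = EvaluationDomain::<Fr>::new(4).unwrap();
    /// // Elements of the subdomain appear in `big` at a stride of |G|/|S| = 2.
    /// assert_eq!(big.reindex_by_subdomain(&small, 1).unwrap(), 2);
    /// // Indices past `small.size()` refer to elements of G \ S, which are
    /// // interleaved between the subdomain's elements.
    /// assert_eq!(big.reindex_by_subdomain(&small, 4).unwrap(), 1);
    /// ```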
    pub fn reindex_by_subdomain(&self, other: &Self, index: usize) -> Result<usize> {
        ensure!(self.size() > other.size(), "other.size() must be smaller than self.size()");

        // Let this subgroup be G, and the subgroup we're re-indexing by be S.
        // Since S is a subgroup, the 0th element of S is at index 0 in G, the first
        // element of S is at index |G|/|S|, the second at 2*|G|/|S|, etc.
        // Thus for an index i that corresponds to S, the index in G is i*|G|/|S|.
        let period = self.size() / other.size();
        if index < other.size() {
            Ok(index * period)
        } else {
            // Let i now be the index of this element in G \ S.
            // Let x be the number of elements in G \ S for every element in S. Then
            // x = (|G|/|S| - 1). At index i in G \ S, the number of elements in S
            // that appear before the index in G to which i corresponds is
            // floor(i / x) + 1. The +1 is because index 0 of G is S_0, so the
            // position is offset by at least one. The floor(i / x) term is
            // because after x elements in G \ S, there is one more element from S
            // that will have appeared in G.
            let i = index - other.size();
            let x = period - 1;
            Ok(i + (i / x) + 1)
        }
    }

    /// Perform O(n) multiplication of two polynomials that are presented by
    /// their evaluations in the domain.
    /// Returns the evaluations of the product over the domain.
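    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_fields::{One, Zero};
    ///
    /// let domain = EvaluationDomain::<Fr>::new(4).unwrap();
    /// // Evaluations of 1 + x over the domain.
    /// let evals = domain.fft(&[Fr::one(), Fr::one()]);
    /// // Pointwise multiplication yields the evaluations of (1 + x)^2.
    /// let product = domain.mul_polynomials_in_evaluation_domain(evals.clone(), &evals).unwrap();
    /// let two = Fr::one() + Fr::one();
    /// assert_eq!(domain.ifft(&product), vec![Fr::one(), two, Fr::one(), Fr::zero()]);
    /// ```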
    pub fn mul_polynomials_in_evaluation_domain(&self, self_evals: Vec<F>, other_evals: &[F]) -> Result<Vec<F>> {
        let mut result = self_evals;

        ensure!(result.len() == other_evals.len());
        cfg_iter_mut!(result).zip_eq(other_evals).for_each(|(a, b)| *a *= b);

        Ok(result)
    }
}

impl<F: FftField> EvaluationDomain<F> {
    /// Precomputes the roots of unity used by FFTs over this domain.
    pub fn precompute_fft(&self) -> FFTPrecomputation<F> {
        execute_with_max_available_threads(|| FFTPrecomputation {
            roots: self.roots_of_unity(self.group_gen),
            domain: *self,
        })
    }

    /// Precomputes the inverse roots of unity used by IFFTs over this domain.
    pub fn precompute_ifft(&self) -> IFFTPrecomputation<F> {
        execute_with_max_available_threads(|| IFFTPrecomputation {
            inverse_roots: self.roots_of_unity(self.group_gen_inv),
            domain: *self,
        })
    }

    pub(crate) fn in_order_fft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold and check that the type is Fr
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_fft();
        self.fft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc)
    }

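    /// Computes an in-order FFT of `x_s` (padded with zeros to the domain size),
    /// reusing the precomputed roots of unity in `pc`.
    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_fields::One;
    ///
    /// let domain = EvaluationDomain::<Fr>::new(4).unwrap();
    /// let pc = domain.precompute_fft();
    /// let coeffs = vec![Fr::one(); 3];
    /// // Equivalent to `domain.fft(&coeffs)`, but reuses the precomputation.
    /// assert_eq!(domain.in_order_fft_with_pc(&coeffs, &pc), domain.fft(&coeffs));
    /// ```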
    pub fn in_order_fft_with_pc<T: DomainCoeff<F>>(&self, x_s: &[T], pc: &FFTPrecomputation<F>) -> Vec<T> {
        let mut x_s = x_s.to_vec();
        if self.size() != x_s.len() {
            x_s.extend(core::iter::repeat(T::zero()).take(self.size() - x_s.len()));
        }
        self.fft_helper_in_place_with_pc(&mut x_s, FFTOrder::II, pc);
        x_s
    }

    pub(crate) fn in_order_ifft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_ifft();
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc);
        cfg_iter_mut!(x_s).for_each(|val| *val *= self.size_inv);
    }

    pub(crate) fn in_order_coset_ifft_in_place<T: DomainCoeff<F>>(&self, x_s: &mut [T]) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            );
            if result.is_ok() {
                return;
            }
        }

        let pc = self.precompute_ifft();
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, &pc);
        let coset_shift = self.generator_inv;
        Self::distribute_powers_and_mul_by_const(x_s, coset_shift, self.size_inv);
    }

    #[allow(unused)]
    pub(crate) fn in_order_fft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &FFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        self.fft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp)
    }

    pub(crate) fn out_order_fft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &FFTPrecomputation<F>,
    ) {
        self.fft_helper_in_place_with_pc(x_s, FFTOrder::IO, pre_comp)
    }

    pub(crate) fn in_order_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            );
            if result.is_ok() {
                return;
            }
        }

        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp);
        cfg_iter_mut!(x_s).for_each(|val| *val *= self.size_inv);
    }

    pub(crate) fn out_order_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::OI, pre_comp);
        cfg_iter_mut!(x_s).for_each(|val| *val *= self.size_inv);
    }

    #[allow(unused)]
    pub(crate) fn in_order_coset_ifft_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
        // SNP TODO: how to set threshold
        if self.size >= 32 && std::mem::size_of::<T>() == 32 {
            let result = snarkvm_algorithms_cuda::NTT(
                self.size as usize,
                x_s,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            );
            if result.is_ok() {
                return;
            }
        }

        self.ifft_helper_in_place_with_pc(x_s, FFTOrder::II, pre_comp);
        let coset_shift = self.generator_inv;
        Self::distribute_powers_and_mul_by_const(x_s, coset_shift, self.size_inv);
    }

    fn fft_helper_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        ord: FFTOrder,
        pre_comp: &FFTPrecomputation<F>,
    ) {
        use FFTOrder::*;
        let pc = pre_comp.precomputation_for_subdomain(self).unwrap();

        let log_len = log2(x_s.len());

        if ord == OI {
            self.oi_helper_with_roots(x_s, &pc.roots);
        } else {
            self.io_helper_with_roots(x_s, &pc.roots);
        }

        if ord == II {
            derange_helper(x_s, log_len);
        }
    }

    // Handles an IFFT, supporting both in-order and out-of-order inputs
    // and outputs. The results must all be divided by |x_s|,
    // which is left up to the caller to do.
    fn ifft_helper_in_place_with_pc<T: DomainCoeff<F>>(
        &self,
        x_s: &mut [T],
        ord: FFTOrder,
        pre_comp: &IFFTPrecomputation<F>,
    ) {
        use FFTOrder::*;
        let pc = pre_comp.precomputation_for_subdomain(self).unwrap();

        let log_len = log2(x_s.len());

        if ord == II {
            derange_helper(x_s, log_len);
        }

        if ord == IO {
            self.io_helper_with_roots(x_s, &pc.inverse_roots);
        } else {
            self.oi_helper_with_roots(x_s, &pc.inverse_roots);
        }
    }

    /// Computes the first `self.size / 2` roots of unity for the entire domain.
    /// e.g. for the domain [1, g, g^2, ..., g^{n - 1}], it computes
    /// [1, g, g^2, ..., g^{(n/2) - 1}].
    #[cfg(feature = "serial")]
    pub fn roots_of_unity(&self, root: F) -> Vec<F> {
        compute_powers_serial((self.size as usize) / 2, root)
    }

    /// Computes the first `self.size / 2` roots of unity.
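    ///
    /// # Example
    ///
    /// A minimal sketch (assuming the `bls12_377::Fr` scalar field, as used in
    /// the tests below):
    ///
    /// ```
    /// use snarkvm_algorithms::fft::EvaluationDomain;
    /// use snarkvm_curves::bls12_377::Fr;
    /// use snarkvm_fields::One;
    ///
    /// let domain = EvaluationDomain::<Fr>::new(8).unwrap();
    /// let roots = domain.roots_of_unity(domain.group_gen);
    /// // Only the first n/2 powers of the generator are returned.
    /// assert_eq!(roots.len(), 4);
    /// assert_eq!(roots[0], Fr::one());
    /// assert_eq!(roots[1], domain.group_gen);
    /// ```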
    #[cfg(not(feature = "serial"))]
    pub fn roots_of_unity(&self, root: F) -> Vec<F> {
        // TODO: check if this method can replace parallel compute powers.
        let log_size = log2(self.size as usize);
        // Early exit for short inputs.
        if log_size <= LOG_ROOTS_OF_UNITY_PARALLEL_SIZE {
            compute_powers_serial((self.size as usize) / 2, root)
        } else {
            let mut temp = root;
            // w, w^2, w^4, w^8, ..., w^(2^(log_size - 1))
            let log_powers: Vec<F> = (0..(log_size - 1))
                .map(|_| {
                    let old_value = temp;
                    temp.square_in_place();
                    old_value
                })
                .collect();

            // Allocate the return array and start the recursion.
            let mut powers = vec![F::zero(); 1 << (log_size - 1)];
            Self::roots_of_unity_recursive(&mut powers, &log_powers);
            powers
        }
    }

    #[cfg(not(feature = "serial"))]
    fn roots_of_unity_recursive(out: &mut [F], log_powers: &[F]) {
        assert_eq!(out.len(), 1 << log_powers.len());
        // Base case: just compute the powers sequentially,
        // g = log_powers[0], out = [1, g, g^2, ...]
        if log_powers.len() <= LOG_ROOTS_OF_UNITY_PARALLEL_SIZE as usize {
            out[0] = F::one();
            for idx in 1..out.len() {
                out[idx] = out[idx - 1] * log_powers[0];
            }
            return;
        }

        // Recursive case:
        // 1. Split log_powers in half.
        let (lr_lo, lr_hi) = log_powers.split_at((1 + log_powers.len()) / 2);
        let mut scr_lo = vec![F::default(); 1 << lr_lo.len()];
        let mut scr_hi = vec![F::default(); 1 << lr_hi.len()];
        // 2. Compute each half individually.
        rayon::join(
            || Self::roots_of_unity_recursive(&mut scr_lo, lr_lo),
            || Self::roots_of_unity_recursive(&mut scr_hi, lr_hi),
        );
        // 3. Recombine the halves.
        // At this point, out is a blank slice.
        out.par_chunks_mut(scr_lo.len()).zip(&scr_hi).for_each(|(out_chunk, scr_hi)| {
            for (out_elem, scr_lo) in out_chunk.iter_mut().zip(&scr_lo) {
                *out_elem = *scr_hi * scr_lo;
            }
        });
    }

    #[inline(always)]
    fn butterfly_fn_io<T: DomainCoeff<F>>(((lo, hi), root): ((&mut T, &mut T), &F)) {
        let neg = *lo - *hi;
        *lo += *hi;
        *hi = neg;
        *hi *= *root;
    }

    #[inline(always)]
    fn butterfly_fn_oi<T: DomainCoeff<F>>(((lo, hi), root): ((&mut T, &mut T), &F)) {
        *hi *= *root;
        let neg = *lo - *hi;
        *lo += *hi;
        *hi = neg;
    }

    #[allow(clippy::too_many_arguments)]
    fn apply_butterfly<T: DomainCoeff<F>, G: Fn(((&mut T, &mut T), &F)) + Copy + Sync + Send>(
        g: G,
        xi: &mut [T],
        roots: &[F],
        step: usize,
        chunk_size: usize,
        num_chunks: usize,
        max_threads: usize,
        gap: usize,
    ) {
        cfg_chunks_mut!(xi, chunk_size).for_each(|cxi| {
            let (lo, hi) = cxi.split_at_mut(gap);
            // If the chunk is sufficiently big that parallelism helps,
            // we parallelize the butterfly operation within the chunk.
            if gap > MIN_GAP_SIZE_FOR_PARALLELISATION && num_chunks < max_threads {
                cfg_iter_mut!(lo).zip(hi).zip(cfg_iter!(roots).step_by(step)).for_each(g);
            } else {
                lo.iter_mut().zip(hi).zip(roots.iter().step_by(step)).for_each(g);
            }
        });
    }

    #[allow(clippy::unnecessary_to_owned)]
    fn io_helper_with_roots<T: DomainCoeff<F>>(&self, xi: &mut [T], roots: &[F]) {
        let mut roots = std::borrow::Cow::Borrowed(roots);

        let mut step = 1;
        let mut first = true;

        #[cfg(not(feature = "serial"))]
        let max_threads = snarkvm_utilities::parallel::max_available_threads();
        #[cfg(feature = "serial")]
        let max_threads = 1;

        let mut gap = xi.len() / 2;
        while gap > 0 {
            // Each butterfly cluster uses 2*gap positions.
            let chunk_size = 2 * gap;
            let num_chunks = xi.len() / chunk_size;

            // Only compact the roots (for cache locality) if the roots are
            // looked up a significant number of times, which also implies a
            // large lookup stride.
            if num_chunks >= MIN_NUM_CHUNKS_FOR_COMPACTION {
                if !first {
                    roots = Cow::Owned(cfg_into_iter!(roots.into_owned()).step_by(step * 2).collect());
                }
                step = 1;
                roots.to_mut().shrink_to_fit();
            } else {
                step = num_chunks;
            }
            first = false;

            Self::apply_butterfly(
                Self::butterfly_fn_io,
                xi,
                &roots[..],
                step,
                chunk_size,
                num_chunks,
                max_threads,
                gap,
            );

            gap /= 2;
        }
    }

    fn oi_helper_with_roots<T: DomainCoeff<F>>(&self, xi: &mut [T], roots_cache: &[F]) {
        // The `cmp::min` is only necessary for the case where
        // `MIN_NUM_CHUNKS_FOR_COMPACTION = 1`. Else, notice that we compact
        // the roots cache by a stride of at least `MIN_NUM_CHUNKS_FOR_COMPACTION`.
        let compaction_max_size =
            core::cmp::min(roots_cache.len() / 2, roots_cache.len() / MIN_NUM_CHUNKS_FOR_COMPACTION);
        let mut compacted_roots = vec![F::default(); compaction_max_size];

        #[cfg(not(feature = "serial"))]
        let max_threads = snarkvm_utilities::parallel::max_available_threads();
        #[cfg(feature = "serial")]
        let max_threads = 1;

        let mut gap = 1;
        while gap < xi.len() {
            // Each butterfly cluster uses 2*gap positions.
            let chunk_size = 2 * gap;
            let num_chunks = xi.len() / chunk_size;

            // Only compact the roots (for cache locality) if the roots are
            // looked up a significant number of times, which also implies a
            // large lookup stride.
            let (roots, step) = if num_chunks >= MIN_NUM_CHUNKS_FOR_COMPACTION && gap < xi.len() / 2 {
                cfg_iter_mut!(compacted_roots[..gap])
                    .zip(cfg_iter!(roots_cache[..(gap * num_chunks)]).step_by(num_chunks))
                    .for_each(|(a, b)| *a = *b);
                (&compacted_roots[..gap], 1)
            } else {
                (roots_cache, num_chunks)
            };

            Self::apply_butterfly(Self::butterfly_fn_oi, xi, roots, step, chunk_size, num_chunks, max_threads, gap);

            gap *= 2;
        }
    }
}

/// The minimum number of chunks at which root compaction
/// is beneficial.
const MIN_NUM_CHUNKS_FOR_COMPACTION: usize = 1 << 7;

/// The minimum size of a chunk at which parallelization of `butterfly`s is
/// beneficial. This value was chosen empirically.
const MIN_GAP_SIZE_FOR_PARALLELISATION: usize = 1 << 10;

// The minimum log2 size at which to parallelize the roots-of-unity computation.
#[cfg(not(feature = "serial"))]
const LOG_ROOTS_OF_UNITY_PARALLEL_SIZE: u32 = 7;

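/// Reverses the lowest `log_len` bits of `a`; e.g. `bitrev(0b011, 3) == 0b110`.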
#[inline]
pub(super) fn bitrev(a: u64, log_len: u32) -> u64 {
    a.reverse_bits() >> (64 - log_len)
}

pub(crate) fn derange<T>(xi: &mut [T]) {
    derange_helper(xi, log2(xi.len()))
}

fn derange_helper<T>(xi: &mut [T], log_len: u32) {
    for idx in 1..(xi.len() as u64 - 1) {
        let ridx = bitrev(idx, log_len);
        if idx < ridx {
            xi.swap(idx as usize, ridx as usize);
        }
    }
}

#[derive(PartialEq, Eq, Debug)]
enum FFTOrder {
    /// Both the input and the output of the FFT must be in-order.
    II,
    /// The input of the FFT must be in-order, but the output does not have to
    /// be.
    IO,
    /// The input of the FFT can be out of order, but the output must be
    /// in-order.
    OI,
}

pub(crate) fn compute_powers_serial<F: Field>(size: usize, root: F) -> Vec<F> {
    compute_powers_and_mul_by_const_serial(size, root, F::one())
}

pub(crate) fn compute_powers_and_mul_by_const_serial<F: Field>(size: usize, root: F, c: F) -> Vec<F> {
    let mut value = c;
    (0..size)
        .map(|_| {
            let old_value = value;
            value *= root;
            old_value
        })
        .collect()
}

#[allow(unused)]
#[cfg(not(feature = "serial"))]
pub(crate) fn compute_powers<F: Field>(size: usize, g: F) -> Vec<F> {
    if size < MIN_PARALLEL_CHUNK_SIZE {
        return compute_powers_serial(size, g);
    }
    // Compute the number of threads we will be using.
    let num_cpus_available = max_available_threads();
    let num_elem_per_thread = core::cmp::max(size / num_cpus_available, MIN_PARALLEL_CHUNK_SIZE);
    let num_cpus_used = size / num_elem_per_thread;

    // Split up the powers to compute across each thread evenly.
    let res: Vec<F> = (0..num_cpus_used)
        .into_par_iter()
        .flat_map(|i| {
            let offset = g.pow([(i * num_elem_per_thread) as u64]);
            // Compute the size that this chunk's output should be:
            // num_elem_per_thread, unless there are fewer than
            // num_elem_per_thread elements remaining.
            let num_elements_to_compute = core::cmp::min(size - i * num_elem_per_thread, num_elem_per_thread);
            compute_powers_and_mul_by_const_serial(num_elements_to_compute, g, offset)
        })
        .collect();
    res
}

/// An iterator over the elements of the domain.
#[derive(Clone)]
pub struct Elements<F: FftField> {
    cur_elem: F,
    cur_pow: u64,
    domain: EvaluationDomain<F>,
}

impl<F: FftField> Iterator for Elements<F> {
    type Item = F;

    fn next(&mut self) -> Option<F> {
        if self.cur_pow == self.domain.size {
            None
        } else {
            let cur_elem = self.cur_elem;
            self.cur_elem *= &self.domain.group_gen;
            self.cur_pow += 1;
            Some(cur_elem)
        }
    }
}

/// A precomputation of the roots of unity used to perform FFTs over a domain.
#[derive(Clone, Eq, PartialEq, Debug, CanonicalDeserialize, CanonicalSerialize)]
pub struct FFTPrecomputation<F: FftField> {
    roots: Vec<F>,
    domain: EvaluationDomain<F>,
}

impl<F: FftField> FFTPrecomputation<F> {
    /// Converts this FFT precomputation into an IFFT precomputation by
    /// batch-inverting the roots of unity.
    pub fn to_ifft_precomputation(&self) -> IFFTPrecomputation<F> {
        let mut inverse_roots = self.roots.clone();
        snarkvm_fields::batch_inversion(&mut inverse_roots);
        IFFTPrecomputation { inverse_roots, domain: self.domain }
    }

    /// Returns this precomputation restricted to `domain`, if `domain` is a
    /// subdomain of the domain this precomputation was made for.
    pub fn precomputation_for_subdomain<'a>(&'a self, domain: &EvaluationDomain<F>) -> Option<Cow<'a, Self>> {
        if domain.size() == 1 {
            return Some(Cow::Owned(Self { roots: vec![], domain: *domain }));
        }
        if &self.domain == domain {
            Some(Cow::Borrowed(self))
        } else if domain.size() < self.domain.size() {
            let size_ratio = self.domain.size() / domain.size();
            let roots = self.roots.iter().step_by(size_ratio).copied().collect();
            Some(Cow::Owned(Self { roots, domain: *domain }))
        } else {
            None
        }
    }
}

/// A precomputation of the inverse roots of unity used to perform IFFTs over a domain.
#[derive(Clone, Eq, PartialEq, Debug, CanonicalSerialize, CanonicalDeserialize)]
pub struct IFFTPrecomputation<F: FftField> {
    inverse_roots: Vec<F>,
    domain: EvaluationDomain<F>,
}

impl<F: FftField> IFFTPrecomputation<F> {
    /// Returns this precomputation restricted to `domain`, if `domain` is a
    /// subdomain of the domain this precomputation was made for.
    pub fn precomputation_for_subdomain<'a>(&'a self, domain: &EvaluationDomain<F>) -> Option<Cow<'a, Self>> {
        if domain.size() == 1 {
            return Some(Cow::Owned(Self { inverse_roots: vec![], domain: *domain }));
        }
        if &self.domain == domain {
            Some(Cow::Borrowed(self))
        } else if domain.size() < self.domain.size() {
            let size_ratio = self.domain.size() / domain.size();
            let inverse_roots = self.inverse_roots.iter().step_by(size_ratio).copied().collect();
            Some(Cow::Owned(Self { inverse_roots, domain: *domain }))
        } else {
            None
        }
    }
}

#[cfg(test)]
mod tests {
    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
    use crate::fft::domain::FFTOrder;
    use crate::fft::{DensePolynomial, EvaluationDomain};
    use rand::Rng;
    use snarkvm_curves::bls12_377::Fr;
    use snarkvm_fields::{FftField, Field, One, Zero};
    use snarkvm_utilities::{TestRng, Uniform};

    #[test]
    fn vanishing_polynomial_evaluation() {
        let rng = &mut TestRng::default();
        for coeffs in 0..10 {
            let domain = EvaluationDomain::<Fr>::new(coeffs).unwrap();
            let z = domain.vanishing_polynomial();
            for _ in 0..100 {
                let point = rng.gen();
                assert_eq!(z.evaluate(point), domain.evaluate_vanishing_polynomial(point))
            }
        }
    }

    #[test]
    fn vanishing_polynomial_vanishes_on_domain() {
        for coeffs in 0..1000 {
            let domain = EvaluationDomain::<Fr>::new(coeffs).unwrap();
            let z = domain.vanishing_polynomial();
            for point in domain.elements() {
                assert!(z.evaluate(point).is_zero())
            }
        }
    }

    #[test]
    fn size_of_elements() {
        for coeffs in 1..10 {
            let size = 1 << coeffs;
            let domain = EvaluationDomain::<Fr>::new(size).unwrap();
            let domain_size = domain.size();
            assert_eq!(domain_size, domain.elements().count());
        }
    }

    #[test]
    fn elements_contents() {
        for coeffs in 1..10 {
            let size = 1 << coeffs;
            let domain = EvaluationDomain::<Fr>::new(size).unwrap();
            for (i, element) in domain.elements().enumerate() {
                assert_eq!(element, domain.group_gen.pow([i as u64]));
            }
        }
    }

    /// Test that Lagrange interpolation for a random polynomial at a random
    /// point works.
    #[test]
    fn non_systematic_lagrange_coefficients_test() {
        let mut rng = TestRng::default();
        for domain_dimension in 1..10 {
            let domain_size = 1 << domain_dimension;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            // Get a random point and the Lagrange coefficients at that point.
            let random_point = Fr::rand(&mut rng);
            let lagrange_coefficients = domain.evaluate_all_lagrange_coefficients(random_point);

            // Sample a random polynomial, and evaluate it over the domain and at the
            // random point.
            let random_polynomial = DensePolynomial::<Fr>::rand(domain_size - 1, &mut rng);
            let polynomial_evaluations = domain.fft(random_polynomial.coeffs());
            let actual_evaluation = random_polynomial.evaluate(random_point);

            // Do Lagrange interpolation, and compare against the actual evaluation.
            let mut interpolated_evaluation = Fr::zero();
            for i in 0..domain_size {
                interpolated_evaluation += lagrange_coefficients[i] * polynomial_evaluations[i];
            }
            assert_eq!(actual_evaluation, interpolated_evaluation);
        }
    }

    /// Test that the Lagrange coefficients for a point in the domain are correct.
    #[test]
    fn systematic_lagrange_coefficients_test() {
        // This runs in time O(N^2) in the domain size, so keep the domain dimension
        // low. We generate Lagrange coefficients for each element in the domain.
        for domain_dimension in 1..5 {
            let domain_size = 1 << domain_dimension;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let all_domain_elements: Vec<Fr> = domain.elements().collect();
            for (i, domain_element) in all_domain_elements.iter().enumerate().take(domain_size) {
                let lagrange_coefficients = domain.evaluate_all_lagrange_coefficients(*domain_element);
                for (j, lagrange_coefficient) in lagrange_coefficients.iter().enumerate().take(domain_size) {
                    // The Lagrange coefficient at the evaluation point should be 1.
                    if i == j {
                        assert_eq!(*lagrange_coefficient, Fr::one());
                    } else {
                        assert_eq!(*lagrange_coefficient, Fr::zero());
                    }
                }
            }
        }
    }

    /// Tests that the roots of unity result is the same as domain.elements().
    #[test]
    fn test_roots_of_unity() {
        let max_degree = 10;
        for log_domain_size in 0..max_degree {
            let domain_size = 1 << log_domain_size;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let actual_roots = domain.roots_of_unity(domain.group_gen);
            for &value in &actual_roots {
                assert!(domain.evaluate_vanishing_polynomial(value).is_zero());
            }
            let expected_roots_elements = domain.elements();
            for (expected, &actual) in expected_roots_elements.zip(&actual_roots) {
                assert_eq!(expected, actual);
            }
            assert_eq!(actual_roots.len(), domain_size / 2);
        }
    }

    /// Tests that the FFTs output the correct result.
    #[test]
    fn test_fft_correctness() {
        // This assumes a correct polynomial evaluation-at-point procedure.
        // It tests consistency of FFT/IFFT, and coset_fft/coset_ifft,
        // along with testing that each individual evaluation is correct.

        let mut rng = TestRng::default();

        // Runs in time O(degree^2)
        let log_degree = 5;
        let degree = 1 << log_degree;
        let random_polynomial = DensePolynomial::<Fr>::rand(degree - 1, &mut rng);

        for log_domain_size in log_degree..(log_degree + 2) {
            let domain_size = 1 << log_domain_size;
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let polynomial_evaluations = domain.fft(&random_polynomial.coeffs);
            let polynomial_coset_evaluations = domain.coset_fft(&random_polynomial.coeffs);
            for (i, x) in domain.elements().enumerate() {
                let coset_x = Fr::multiplicative_generator() * x;

                assert_eq!(polynomial_evaluations[i], random_polynomial.evaluate(x));
                assert_eq!(polynomial_coset_evaluations[i], random_polynomial.evaluate(coset_x));
            }

            let random_polynomial_from_subgroup =
                DensePolynomial::from_coefficients_vec(domain.ifft(&polynomial_evaluations));
            let random_polynomial_from_coset =
                DensePolynomial::from_coefficients_vec(domain.coset_ifft(&polynomial_coset_evaluations));

            assert_eq!(
                random_polynomial, random_polynomial_from_subgroup,
                "degree = {degree}, domain size = {domain_size}"
            );
            assert_eq!(
                random_polynomial, random_polynomial_from_coset,
                "degree = {degree}, domain size = {domain_size}"
            );
        }
    }

    /// Tests that FFT precomputation is correctly subdomained.
    #[test]
    fn test_fft_precomputation() {
        for i in 1..10 {
            let big_domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = big_domain.precompute_fft();
            for j in 1..i {
                let small_domain = EvaluationDomain::<Fr>::new(j).unwrap();
                let small_pc = small_domain.precompute_fft();
                assert_eq!(pc.precomputation_for_subdomain(&small_domain).unwrap().as_ref(), &small_pc);
            }
        }
    }

    /// Tests that IFFT precomputation is correctly subdomained.
    #[test]
    fn test_ifft_precomputation() {
        for i in 1..10 {
            let big_domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = big_domain.precompute_ifft();
            for j in 1..i {
                let small_domain = EvaluationDomain::<Fr>::new(j).unwrap();
                let small_pc = small_domain.precompute_ifft();
                assert_eq!(pc.precomputation_for_subdomain(&small_domain).unwrap().as_ref(), &small_pc);
            }
        }
    }

    /// Tests that IFFT precomputation can be correctly computed from
    /// FFT precomputation.
    #[test]
    fn test_ifft_precomputation_from_fft() {
        for i in 1..10 {
            let domain = EvaluationDomain::<Fr>::new(i).unwrap();
            let pc = domain.precompute_ifft();
            let fft_pc = domain.precompute_fft();
            assert_eq!(pc, fft_pc.to_ifft_precomputation())
        }
    }

    /// Tests that the CUDA NTTs match the CPU FFT implementation.
    #[cfg(all(feature = "cuda", target_arch = "x86_64"))]
    #[test]
    fn test_fft_correctness_cuda() {
        let mut rng = TestRng::default();
        for log_domain in 2..20 {
            println!("Testing domain size {log_domain}");
            let domain_size = 1 << log_domain;
            let random_polynomial = DensePolynomial::<Fr>::rand(domain_size - 1, &mut rng);
            let mut polynomial_evaluations = random_polynomial.coeffs.clone();
            let mut polynomial_evaluations_cuda = random_polynomial.coeffs.clone();

            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let pc = domain.precompute_fft();
            domain.fft_helper_in_place_with_pc(&mut polynomial_evaluations, FFTOrder::II, &pc);

            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Standard,
            )
            .is_err()
            {
                println!("cuda error!");
            }

            assert_eq!(polynomial_evaluations, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // iNTT
            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Standard,
            )
            .is_err()
            {
                println!("cuda error!");
            }
            assert_eq!(random_polynomial.coeffs, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // Coset NTT
            polynomial_evaluations = random_polynomial.coeffs.clone();
            let domain = EvaluationDomain::<Fr>::new(domain_size).unwrap();
            let pc = domain.precompute_fft();
            EvaluationDomain::<Fr>::distribute_powers(&mut polynomial_evaluations, Fr::multiplicative_generator());
            domain.fft_helper_in_place_with_pc(&mut polynomial_evaluations, FFTOrder::II, &pc);

            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Forward,
                snarkvm_algorithms_cuda::NTTType::Coset,
            )
            .is_err()
            {
                println!("cuda error!");
            }

            assert_eq!(polynomial_evaluations, polynomial_evaluations_cuda, "domain size = {domain_size}");

            // Coset iNTT
            if snarkvm_algorithms_cuda::NTT::<Fr>(
                domain_size,
                &mut polynomial_evaluations_cuda,
                snarkvm_algorithms_cuda::NTTInputOutputOrder::NN,
                snarkvm_algorithms_cuda::NTTDirection::Inverse,
                snarkvm_algorithms_cuda::NTTType::Coset,
            )
            .is_err()
            {
                println!("cuda error!");
            }
            assert_eq!(random_polynomial.coeffs, polynomial_evaluations_cuda, "domain size = {domain_size}");
        }
    }
}