bellman_ce/plonk/polynomials/
mod.rs

1use crate::pairing::ff::PrimeField;
2use crate::plonk::domains::*;
3use crate::SynthesisError;
4use crate::worker::*;
5use crate::plonk::fft::*;
6use crate::plonk::fft::with_precomputation;
7use crate::plonk::fft::with_precomputation::FftPrecomputations;
8
9use crate::plonk::fft::cooley_tukey_ntt;
10use crate::plonk::fft::cooley_tukey_ntt::CTPrecomputations;
11use crate::plonk::fft::cooley_tukey_ntt::partial_reduction;
12
13use crate::plonk::transparent_engine::PartialTwoBitReductionField;
14
15pub trait PolynomialForm: Sized + Copy + Clone {}
16
17#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
18pub enum Coefficients { }
19
20#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
21pub enum Values { }
22
23impl PolynomialForm for Coefficients {}
24impl PolynomialForm for Values{}
25
26// TODO: Enforce bitreversed values as a separate form
27
28#[derive(Clone, PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
29#[serde(bound(deserialize = "F: serde::de::DeserializeOwned"))]
30pub struct Polynomial<F: PrimeField, P: PolynomialForm> {
31    coeffs: Vec<F>,
32    pub exp: u32,
33    pub omega: F,
34    pub omegainv: F,
35    pub geninv: F,
36    pub minv: F,
37
38    #[serde(skip_serializing, default)]
39    #[serde(bound(serialize = ""))]
40    #[serde(bound(deserialize = ""))]
41    _marker: std::marker::PhantomData<P>
42}
43
44
45impl<F: PrimeField, P: PolynomialForm> Polynomial<F, P> {
46    pub fn size(&self) -> usize {
47        self.coeffs.len()
48    }
49
50    pub fn as_ref(&self) -> &[F] {
51        &self.coeffs
52    }
53
54    pub fn as_mut(&mut self) -> &mut [F] {
55        &mut self.coeffs
56    }
57
58    pub fn into_coeffs(self) -> Vec<F> {
59        self.coeffs
60    }
61
62    pub fn distribute_powers(&mut self, worker: &Worker, g: F)
63    {
64        distribute_powers(&mut self.coeffs, &worker, g);
65    }
66
67    pub fn reuse_allocation<PP: PolynomialForm> (&mut self, other: &Polynomial<F, PP>) {
68        assert_eq!(self.coeffs.len(), other.coeffs.len());
69        self.coeffs.copy_from_slice(&other.coeffs);
70    }
71
72    pub fn bitreverse_enumeration(&mut self, worker: &Worker) {
73        let total_len = self.coeffs.len();
74        let log_n = self.exp as usize;
75        if total_len <= worker.cpus {
76            for j in 0..total_len {
77                let rj = cooley_tukey_ntt::bitreverse(j, log_n);
78                if j < rj {
79                    self.coeffs.swap(j, rj);
80                }  
81            }
82
83            return;
84        }
85
86        let r = &mut self.coeffs[..] as *mut [F];
87
88        let to_spawn = worker.cpus;
89
90        let chunk = Worker::chunk_size_for_num_spawned_threads(total_len, to_spawn);
91
92        // while it's unsafe we don't care cause swapping is always one to one
93
94        worker.scope(0, |scope, _| {
95            for thread_id in 0..to_spawn {
96                let r = unsafe {&mut *r};
97                scope.spawn(move |_| {
98                    let start = thread_id * chunk;
99                    let end = if start + chunk <= total_len {
100                        start + chunk
101                    } else {
102                        total_len
103                    };
104                    for j in start..end {
105                        let rj = cooley_tukey_ntt::bitreverse(j, log_n);
106                        if j < rj {
107                            r.swap(j, rj);
108                        }  
109                    }
110                });
111            }
112        });
113    }
114
115    pub fn scale(&mut self, worker: &Worker, g: F)
116    {
117        if g == F::one() {
118            return;
119        }
120
121        worker.scope(self.coeffs.len(), |scope, chunk| {
122            for v in self.coeffs.chunks_mut(chunk) {
123                scope.spawn(move |_| {
124                    for v in v.iter_mut() {
125                        v.mul_assign(&g);
126                    }
127                });
128            }
129        });
130    }
131
132    pub fn negate(&mut self, worker: &Worker)
133    {
134        worker.scope(self.coeffs.len(), |scope, chunk| {
135            for v in self.coeffs.chunks_mut(chunk) {
136                scope.spawn(move |_| {
137                    for v in v.iter_mut() {
138                        v.negate();
139                    }
140                });
141            }
142        });
143    }
144
145    pub fn map<M: Fn(&mut F) -> () + Send + Copy>(&mut self, worker: &Worker, func: M)
146    {
147        worker.scope(self.coeffs.len(), |scope, chunk| {
148            for v in self.coeffs.chunks_mut(chunk) {
149                scope.spawn(move |_| {
150                    for v in v.iter_mut() {
151                        func(v);
152                    }
153                });
154            }
155        });
156    }
157
158    pub fn map_indexed<M: Fn(usize, &mut F) -> () + Send + Copy>(&mut self, worker: &Worker, func: M)
159    {
160        worker.scope(self.coeffs.len(), |scope, chunk| {
161            for (chunk_idx, v) in self.coeffs.chunks_mut(chunk).enumerate() {
162                scope.spawn(move |_| {
163                    let base = chunk_idx * chunk;
164                    for (in_chunk_idx, v) in v.iter_mut().enumerate() {
165                        func(base + in_chunk_idx, v);
166                    }
167                });
168            }
169        });
170    }
171
172    pub fn pad_by_factor(&mut self, factor: usize) -> Result<(), SynthesisError> {
173        debug_assert!(self.coeffs.len().is_power_of_two());
174        
175        if factor == 1 {
176            return Ok(());
177        }
178        let next_power_of_two = factor.next_power_of_two();
179        if factor != next_power_of_two {
180            return Err(SynthesisError::Unsatisfiable);
181        }
182
183        let new_size = self.coeffs.len() * factor;
184        self.coeffs.resize(new_size, F::zero());
185
186        let domain = Domain::new_for_size(new_size as u64)?;
187        self.exp = domain.power_of_two as u32;
188        let m = domain.size as usize;
189        self.omega = domain.generator;
190        self.omegainv = self.omega.inverse().unwrap();
191        self.minv = F::from_str(&format!("{}", m)).unwrap().inverse().unwrap();
192
193        Ok(())
194    }
195
196    pub fn pad_to_size(&mut self, new_size: usize) -> Result<(), SynthesisError> {
197        if new_size < self.coeffs.len() {
198            return Err(SynthesisError::PolynomialDegreeTooLarge);
199        }
200        let next_power_of_two = new_size.next_power_of_two();
201        if new_size != next_power_of_two {
202            return Err(SynthesisError::Unsatisfiable);
203        }
204        self.coeffs.resize(new_size, F::zero());
205
206        let domain = Domain::new_for_size(new_size as u64)?;
207        self.exp = domain.power_of_two as u32;
208        let m = domain.size as usize;
209        self.omega = domain.generator;
210        self.omegainv = self.omega.inverse().unwrap();
211        self.minv = F::from_str(&format!("{}", m)).unwrap().inverse().unwrap();
212
213        Ok(())
214    }
215
216    pub fn pad_to_domain(&mut self) -> Result<(), SynthesisError> {
217        let domain = Domain::<F>::new_for_size(self.coeffs.len() as u64)?;
218        self.coeffs.resize(domain.size as usize, F::zero());
219
220        Ok(())
221    }
222
223    pub fn clone_padded_to_domain(&self) -> Result<Self, SynthesisError> {
224        let mut padded = self.clone();
225        let domain = Domain::<F>::new_for_size(self.coeffs.len() as u64)?;
226        padded.coeffs.resize(domain.size as usize, F::zero());
227
228        Ok(padded)
229    }
230
231    pub fn trim_to_degree(&mut self, degree: usize) -> Result<(), SynthesisError> {
232        let size = self.coeffs.len();
233        if size <= degree + 1 {
234            return Ok(());
235        }
236        self.coeffs.truncate(degree + 1);
237        self.coeffs.resize(size, F::zero());
238
239        Ok(())
240    }
241}
242
243impl<F: PrimeField> Polynomial<F, Coefficients> {
244    pub fn new_for_size(size: usize) -> Result<Polynomial<F, Coefficients>, SynthesisError> {
245        let coeffs = vec![F::zero(); size];
246
247        Self::from_coeffs(coeffs)
248    }
249
250    pub fn from_coeffs(mut coeffs: Vec<F>) -> Result<Polynomial<F, Coefficients>, SynthesisError>
251    {
252        let coeffs_len = coeffs.len();
253
254        let domain = Domain::new_for_size(coeffs_len as u64)?;
255        let exp = domain.power_of_two as u32;
256        let m = domain.size as usize;
257        let omega = domain.generator;
258
259        coeffs.resize(m, F::zero());
260
261        Ok(Polynomial::<F, Coefficients> {
262            coeffs: coeffs,
263            exp: exp,
264            omega: omega,
265            omegainv: omega.inverse().unwrap(),
266            geninv: F::multiplicative_generator().inverse().unwrap(),
267            minv: F::from_str(&format!("{}", m)).unwrap().inverse().unwrap(),
268            _marker: std::marker::PhantomData
269        })
270    }
271
272    pub fn from_roots(roots: Vec<F>, worker: &Worker) -> Result<Polynomial<F, Coefficients>, SynthesisError>
273    {
274
275        let coeffs_len = roots.len() + 1;
276
277        let domain = Domain::<F>::new_for_size(coeffs_len as u64)?;
278        let num_threads = worker.cpus;
279
280        // vector of vectors of polynomial coefficients for subproducts
281        let mut subterms = vec![vec![]; num_threads];
282
283        worker.scope(roots.len(), |scope, chunk| {
284            for (r, s) in roots.chunks(chunk)
285                    .zip(subterms.chunks_mut(1)) {
286                scope.spawn(move |_| {
287                    for r in r.iter() {
288                        if s[0].len() == 0 {
289                            let mut tmp = *r;
290                            tmp.negate();
291                            s[0] = vec![tmp, F::one()];
292                        } else {
293                            let mut tmp = Vec::with_capacity(s[0].len() + 1);
294                            tmp.push(F::zero());
295                            tmp.extend(s[0].clone());
296                            for (c, n) in s[0].iter().zip(tmp.iter_mut()) {
297                                let mut t = *c;
298                                t.mul_assign(&r);
299                                n.sub_assign(&t);
300                            }
301                            s[0] = tmp;
302                        }
303                    }
304                });
305            }
306        });
307
308        // now we have subproducts in a coefficient form
309
310        let mut result: Option<Polynomial<F, Values>> = None;
311        let result_len = domain.size as usize;
312
313        for s in subterms.into_iter() {
314            if s.len() == 0 {
315                continue;
316            }
317            let t = Polynomial::<F, Coefficients>::from_coeffs(s)?;
318            let factor = result_len / t.size();
319            let t = t.lde(&worker, factor)?;
320            if let Some(res) = result.as_mut() {
321                res.mul_assign(&worker, &t);
322            } else {
323                result = Some(t);
324            }
325        }
326
327        let result = result.expect("is some");
328        let result = result.ifft(&worker);
329
330        Ok(result)
331    }
332
333    pub fn evaluate_at_domain_for_degree_one(
334        &self, 
335        worker: &Worker, 
336        domain_size: u64
337    ) -> Result<Polynomial<F, Values>, SynthesisError> {
338        assert_eq!(self.coeffs.len(), 2);
339        let alpha = self.coeffs[1];
340        let c = self.coeffs[0];
341
342        let domain = Domain::<F>::new_for_size(domain_size)?;
343
344        let mut result = vec![F::zero(); domain.size as usize];
345        let g = domain.generator;
346        worker.scope(result.len(), |scope, chunk| {
347            for (i, v) in result.chunks_mut(chunk).enumerate() {
348                scope.spawn(move |_| {
349                    let mut u = g.pow(&[(i * chunk) as u64]);
350                    for v in v.iter_mut() {
351                        let mut tmp = alpha;
352                        tmp.mul_assign(&u);
353                        tmp.add_assign(&c);
354                        *v = tmp;
355                        u.mul_assign(&g);
356                    }
357                });
358            }
359        });
360
361        Polynomial::from_values(result)
362    }
363
364    pub fn coset_evaluate_at_domain_for_degree_one(
365        &self, 
366        worker: &Worker, 
367        domain_size: u64
368    ) -> Result<Polynomial<F, Values>, SynthesisError> {
369        assert_eq!(self.coeffs.len(), 2);
370        let alpha = self.coeffs[1];
371        let c = self.coeffs[0];
372
373        let domain = Domain::<F>::new_for_size(domain_size)?;
374
375        let mut result = vec![F::zero(); domain.size as usize];
376        let g = domain.generator;
377        worker.scope(result.len(), |scope, chunk| {
378            for (i, v) in result.chunks_mut(chunk).enumerate() {
379                scope.spawn(move |_| {
380                    let mut u = g.pow(&[(i * chunk) as u64]);
381                    u.mul_assign(&F::multiplicative_generator());
382                    for v in v.iter_mut() {
383                        let mut tmp = alpha;
384                        tmp.mul_assign(&u);
385                        tmp.add_assign(&c);
386                        *v = tmp;
387                        u.mul_assign(&g);
388                    }
389                });
390            }
391        });
392
393        Polynomial::from_values(result)
394    }
395
396    // pub fn sparse_distribute(&mut self, factor: usize, worker: &Worker) -> Result<(), SynthesisError> {
397    //     if factor == 1 {
398    //         return Ok(());
399    //     }
400    //     let next_power_of_two = factor.next_power_of_two();
401    //     if factor != next_power_of_two {
402    //         return Err(SynthesisError::Error);
403    //     }
404        
405    //     let new_size = self.coeffs.len() * factor;
406    //     let new_coeffs = vec![F::zero(); new_size];
407    //     let old_coeffs = std::mem::replace(&mut self.coeffs, new_coeffs);
408
409    //     // now we need to interleave the coefficients
410    //     worker.scope(old_coeffs.len(), |scope, chunk| {
411    //         for (old, new) in old_coeffs.chunks(chunk)
412    //                         .zip(self.coeffs.chunks_mut(chunk*factor)) {
413    //             scope.spawn(move |_| {
414    //                 for (j, old_coeff) in old.iter().enumerate() {
415    //                     new[j*factor] = *old_coeff;
416    //                 }
417    //             });
418    //         }
419    //     });
420
421    //     let domain = Domain::new_for_size(new_size as u64)?;
422    //     self.exp = domain.power_of_two as u32;
423    //     let m = domain.size as usize;
424    //     self.omega = domain.generator;
425    //     self.omegainv = self.omega.inverse().unwrap();
426    //     self.minv = F::from_str(&format!("{}", m)).unwrap().inverse().unwrap();
427
428    //     Ok(())
429    // }
430
431    // pub fn extend(&mut self, factor: usize, _worker: &Worker) -> Result<(), SynthesisError> {
432    //     if factor == 1 {
433    //         return Ok(());
434    //     }
435    //     let next_power_of_two = factor.next_power_of_two();
436    //     if factor != next_power_of_two {
437    //         return Err(SynthesisError::Error);
438    //     }
439        
440    //     let new_size = self.coeffs.len() * factor;
441    //     self.coeffs.resize(new_size, F::zero());
442
443    //     Ok(())
444    // }
445
446    #[inline(always)]
447    pub fn break_into_multiples(self, size: usize) -> Result<Vec<Polynomial<F, Coefficients>>, SynthesisError> {
448        let mut coeffs = self.coeffs;
449
450        let (mut num_parts, last_size) = if coeffs.len() % size == 0 {
451            let num_parts = coeffs.len() / size;
452
453            (num_parts, 0)
454        } else {
455            let num_parts = coeffs.len() / size;
456            let last_size = coeffs.len() - num_parts * size;
457
458            (num_parts, last_size)
459        };
460
461        let mut rev_results = Vec::with_capacity(num_parts);
462
463        if last_size != 0 {
464            let drain = coeffs.split_off(coeffs.len() - last_size);
465            rev_results.push(drain);
466            num_parts -= 1;
467        }
468
469        for _ in 0..num_parts {
470            let drain = coeffs.split_off(coeffs.len() - size);
471            rev_results.push(drain);
472        }
473
474        let mut results = Vec::with_capacity(num_parts);
475
476        for c in rev_results.into_iter().rev() {
477            let poly = Polynomial::<F, Coefficients>::from_coeffs(c)?;
478            results.push(poly);
479        }
480
481        Ok(results)
482    }
483
484    #[inline(always)]
485    pub fn lde(self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
486        self.lde_using_multiple_cosets(worker, factor)
487        // self.filtering_lde(worker, factor)
488    }
489
490    #[inline(always)]
491    pub fn coset_lde(self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
492        self.coset_lde_using_multiple_cosets(worker, factor)
493        // self.filtering_coset_lde(worker, factor)
494    }
495
496    pub fn filtering_lde(self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
497        debug_assert!(self.coeffs.len().is_power_of_two());
498        
499        if factor == 1 {
500            return Ok(self.fft(&worker));
501        }
502        assert!(factor.is_power_of_two());
503        let new_size = self.coeffs.len() * factor;
504        let domain = Domain::new_for_size(new_size as u64)?;
505
506        let mut lde = self.coeffs;
507        lde.resize(new_size as usize, F::zero());
508        best_lde(&mut lde, worker, &domain.generator, domain.power_of_two as u32, factor);
509
510        Polynomial::from_values(lde)
511    }
512
513    pub fn lde_using_multiple_cosets_naive(self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
514        debug_assert!(self.coeffs.len().is_power_of_two());
515        
516        if factor == 1 {
517            return Ok(self.fft(&worker));
518        }
519        assert!(factor.is_power_of_two());
520        let new_size = self.coeffs.len() * factor;
521        let domain = Domain::new_for_size(new_size as u64)?;
522
523        let mut results = vec![];
524
525        let mut coset_generator = F::one();
526
527        let one = F::one();
528
529        for _ in 0..factor {
530            let coeffs = self.clone();
531            let lde = if coset_generator == one {
532                coeffs.fft(&worker)
533            } else {
534                coeffs.coset_fft_for_generator(&worker, coset_generator)
535            };
536
537            results.push(lde.into_coeffs());
538            coset_generator.mul_assign(&domain.generator);
539        }
540
541        let mut final_values = vec![F::zero(); new_size];
542
543        let results_ref = &results;
544
545        worker.scope(final_values.len(), |scope, chunk| {
546            for (i, v) in final_values.chunks_mut(chunk).enumerate() {
547                scope.spawn(move |_| {
548                    let mut idx = i*chunk;
549                    for v in v.iter_mut() {
550                        let coset_idx = idx % factor;
551                        let element_idx = idx / factor; 
552                        *v = results_ref[coset_idx][element_idx];
553
554                        idx += 1;
555                    }
556                });
557            }
558        });
559
560        Polynomial::from_values(final_values)
561    }
562
563    pub fn lde_using_multiple_cosets(self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
564        debug_assert!(self.coeffs.len().is_power_of_two());
565        
566        if factor == 1 {
567            return Ok(self.fft(&worker));
568        }
569
570        let num_cpus = worker.cpus;
571        let num_cpus_hint = if num_cpus <= factor {
572            Some(1)
573        } else {
574            let threads_per_coset = factor / num_cpus;
575            // TODO: it's not immediately clean if having more threads than (virtual) cores benefits
576            // over underutilization of some (virtual) cores
577            // let mut threads_per_coset = factor / num_cpus;
578            // if factor % num_cpus != 0 {
579            //     if (threads_per_coset + 1).is_power_of_two() {
580            //         threads_per_coset += 1;
581            //     }
582            // }
583            Some(threads_per_coset)
584        };
585
586        assert!(factor.is_power_of_two());
587        let new_size = self.coeffs.len() * factor;
588        let domain = Domain::<F>::new_for_size(new_size as u64)?;
589
590        let mut results = vec![self.coeffs; factor];
591
592        let coset_omega = domain.generator;
593        let this_domain_omega = self.omega;
594
595        let log_n = self.exp;
596
597        worker.scope(results.len(), |scope, chunk| {
598            for (i, r) in results.chunks_mut(chunk).enumerate() {
599                scope.spawn(move |_| {
600                    let mut coset_generator = coset_omega.pow(&[i as u64]);
601                    for r in r.iter_mut() {
602                        if coset_generator != F::one() {
603                            distribute_powers_serial(&mut r[..], coset_generator);
604                            // distribute_powers(&mut r[..], &worker, coset_generator);
605                        }
606                        best_fft(&mut r[..], &worker, &this_domain_omega, log_n, num_cpus_hint);
607                        coset_generator.mul_assign(&coset_omega);
608                    }
609                });
610            }
611        });
612
613        let mut final_values = vec![F::zero(); new_size];
614
615        let results_ref = &results;
616
617        worker.scope(final_values.len(), |scope, chunk| {
618            for (i, v) in final_values.chunks_mut(chunk).enumerate() {
619                scope.spawn(move |_| {
620                    let mut idx = i*chunk;
621                    for v in v.iter_mut() {
622                        let coset_idx = idx % factor;
623                        let element_idx = idx / factor; 
624                        *v = results_ref[coset_idx][element_idx];
625
626                        idx += 1;
627                    }
628                });
629            }
630        });
631
632        Polynomial::from_values(final_values)
633    }
634
635    pub fn lde_using_multiple_cosets_with_precomputation<P: FftPrecomputations<F>>(
636        self, 
637        worker: &Worker, 
638        factor: usize,
639        precomputed_omegas: &P
640    ) -> Result<Polynomial<F, Values>, SynthesisError> {
641        debug_assert!(self.coeffs.len().is_power_of_two());
642        debug_assert_eq!(self.size(), precomputed_omegas.domain_size());
643        
644        if factor == 1 {
645            return Ok(self.fft(&worker));
646        }
647
648        let num_cpus = worker.cpus;
649        let num_cpus_hint = if num_cpus <= factor {
650            Some(1)
651        } else {
652            let threads_per_coset = factor / num_cpus;
653            // let mut threads_per_coset = factor / num_cpus;
654            // if factor % num_cpus != 0 {
655            //     if (threads_per_coset + 1).is_power_of_two() {
656            //         threads_per_coset += 1;
657            //     }
658            // }
659            Some(threads_per_coset)
660        };
661
662        assert!(factor.is_power_of_two());
663        let new_size = self.coeffs.len() * factor;
664        let domain = Domain::<F>::new_for_size(new_size as u64)?;
665
666        let mut results = vec![self.coeffs; factor];
667
668        let coset_omega = domain.generator;
669        let this_domain_omega = self.omega;
670
671        let log_n = self.exp;
672
673        worker.scope(results.len(), |scope, chunk| {
674            for (i, r) in results.chunks_mut(chunk).enumerate() {
675                scope.spawn(move |_| {
676                    let mut coset_generator = coset_omega.pow(&[i as u64]);
677                    for r in r.iter_mut() {
678                        distribute_powers(&mut r[..], &worker, coset_generator);
679                        with_precomputation::fft::best_fft(&mut r[..], &worker, &this_domain_omega, log_n, num_cpus_hint, precomputed_omegas);
680                        coset_generator.mul_assign(&coset_omega);
681                    }
682                });
683            }
684        });
685
686        let mut final_values = vec![F::zero(); new_size];
687
688        let results_ref = &results;
689
690        worker.scope(final_values.len(), |scope, chunk| {
691            for (i, v) in final_values.chunks_mut(chunk).enumerate() {
692                scope.spawn(move |_| {
693                    let mut idx = i*chunk;
694                    for v in v.iter_mut() {
695                        let coset_idx = idx % factor;
696                        let element_idx = idx / factor; 
697                        *v = results_ref[coset_idx][element_idx];
698
699                        idx += 1;
700                    }
701                });
702            }
703        });
704
705        Polynomial::from_values(final_values)
706    }
707
708    pub fn lde_using_bitreversed_ntt<P: CTPrecomputations<F>>(
709        self, 
710        worker: &Worker, 
711        factor: usize,
712        precomputed_omegas: &P
713    ) -> Result<Polynomial<F, Values>, SynthesisError> {
714        use std::time::Instant;
715        debug_assert!(self.coeffs.len().is_power_of_two());
716        debug_assert_eq!(self.size(), precomputed_omegas.domain_size());
717        
718        if factor == 1 {
719            return Ok(self.fft(&worker));
720        }
721
722        let num_cpus = worker.cpus;
723        let num_cpus_hint = if num_cpus <= factor {
724            Some(1)
725        } else {
726            let threads_per_coset = factor / num_cpus;
727            // let mut threads_per_coset = factor / num_cpus;
728            // if factor % num_cpus != 0 {
729            //     if (threads_per_coset + 1).is_power_of_two() {
730            //         threads_per_coset += 1;
731            //     }
732            // }
733            Some(threads_per_coset)
734        };
735
736        assert!(factor.is_power_of_two());
737        let current_size = self.coeffs.len();
738        let new_size = current_size * factor;
739        let domain = Domain::<F>::new_for_size(new_size as u64)?;
740
741        let mut results = vec![self.coeffs; factor];
742
743        let coset_omega = domain.generator;
744
745        let log_n_u32 = self.exp;
746        let log_n = log_n_u32 as usize;
747
748        // for r in results.iter_mut().skip(1) {
749        //     let mut coset_generator = coset_omega;
750        //     distribute_powers(&mut r[..], &worker, coset_generator);
751        //     coset_generator.mul_assign(&coset_omega);
752        // }
753
754        worker.scope(results.len(), |scope, chunk| {
755            for (i, r) in results.chunks_mut(chunk).enumerate() {
756                scope.spawn(move |_| {
757                    let mut coset_generator = coset_omega.pow(&[i as u64]);
758                    let mut gen_power = i;
759                    for r in r.iter_mut() {
760                        if gen_power != 0 {
761                            distribute_powers_serial(&mut r[..], coset_generator);
762                        }
763                        // distribute_powers(&mut r[..], &worker, coset_generator);
764                        cooley_tukey_ntt::best_ct_ntt(&mut r[..], &worker, log_n_u32, num_cpus_hint, precomputed_omegas);
765
766                        coset_generator.mul_assign(&coset_omega);
767                        gen_power += 1;
768                    }
769                });
770            }
771        });
772
773        // let mut final_values = vec![F::zero(); new_size];
774
775        let mut final_values = Vec::with_capacity(new_size);
776        unsafe {final_values.set_len(new_size)};
777
778        // copy here is more complicated: to have the value in a natural order
779        // one has to use coset_idx to address the result element
780        // and use bit-reversed lookup for an element index
781
782        let results_ref = &results;
783
784        worker.scope(final_values.len(), |scope, chunk| {
785            for (i, v) in final_values.chunks_mut(chunk).enumerate() {
786                scope.spawn(move |_| {
787                    let mut idx = i*chunk;
788                    for v in v.iter_mut() {
789                        let coset_idx = idx % factor;
790                        let element_idx = idx / factor;
791                        let element_idx = cooley_tukey_ntt::bitreverse(element_idx, log_n); 
792                        *v = results_ref[coset_idx][element_idx];
793
794                        idx += 1;
795                    }
796                });
797            }
798        });
799
800        // let res_ptr = &mut final_values[..] as *mut [F];
801
802        // let factor_log_n = crate::plonk::commitments::transparent::utils::log2_floor(factor);
803
804        //  worker.scope(results.len(), |scope, chunk| {
805        //     for (chunk_idx, r) in results.chunks(chunk).enumerate() {
806        //         let res = unsafe {&mut *res_ptr};
807        //         scope.spawn(move |_| {
808        //             // elements from the coset i should be on the places 
809        //             // of sequence i, i + lde_factor, i + 2*lde_factor, ...
810        //             let mut coset_idx = chunk_idx * chunk;
811        //             for r in r.iter() {
812        //                 for (element_idx, el) in r.iter().enumerate() {
813        //                     let write_to = (cooley_tukey_ntt::bitreverse(element_idx, log_n) << factor_log_n) | coset_idx; 
814        //                     res[write_to] = *el;
815        //                 }
816
817        //                 coset_idx += 1;
818        //             }
819        //         });
820        //     }
821        // });
822
823        Polynomial::from_values(final_values)
824    }
825
826
827    // pub fn lde_using_bitreversed_ntt_no_allocations_lowest_bits_reversed<P: CTPrecomputations<F>>(
828    //     self, 
829    //     worker: &Worker, 
830    //     factor: usize,
831    //     precomputed_omegas: &P
832    // ) -> Result<Polynomial<F, Values>, SynthesisError> {
833    //     debug_assert!(self.coeffs.len().is_power_of_two());
834    //     debug_assert_eq!(self.size(), precomputed_omegas.domain_size());
835        
836    //     if factor == 1 {
837    //         return Ok(self.fft(&worker));
838    //     }
839
840    //     let num_cpus = worker.cpus;
841    //     let num_cpus_hint = if num_cpus <= factor {
842    //         Some(1)
843    //     } else {
844    //         let threads_per_coset = factor / num_cpus;
845    //         // let mut threads_per_coset = factor / num_cpus;
846    //         // if factor % num_cpus != 0 {
847    //         //     if (threads_per_coset + 1).is_power_of_two() {
848    //         //         threads_per_coset += 1;
849    //         //     }
850    //         // }
851    //         Some(threads_per_coset)
852    //     };
853
854    //     assert!(factor.is_power_of_two());
855    //     let current_size = self.coeffs.len();
856    //     let new_size = self.coeffs.len() * factor;
857    //     let domain = Domain::<F>::new_for_size(new_size as u64)?;
858
859    //     // let mut results = vec![self.coeffs.clone(); factor];
860
861    //     let mut result = Vec::with_capacity(new_size);
862    //     unsafe { result.set_len(new_size)};
863
864    //     let r = &mut result[..] as *mut [F];
865
866    //     let coset_omega = domain.generator;
867
868    //     let log_n = self.exp;
869
870    //     let range: Vec<usize> = (0..factor).collect();
871
872    //     let self_coeffs_ref = &self.coeffs;
873
874    //     // copy
875
876    //     worker.scope(range.len(), |scope, chunk| {
877    //         for coset_idx in range.chunks(chunk) {
878    //             let r = unsafe {&mut *r};
879    //             scope.spawn(move |_| {
880    //                 for coset_idx in coset_idx.iter() {
881    //                     let start = current_size * coset_idx;
882    //                     let end = start + current_size;
883    //                     let copy_start_pointer: *mut F = r[start..end].as_mut_ptr();
884                        
885    //                     unsafe { std::ptr::copy_nonoverlapping(self_coeffs_ref.as_ptr(), copy_start_pointer, current_size) };
886    //                 }
887    //             });
888    //         }
889    //     });
890
891    //     // let mut coset_generator = F::one();
892    //     // for _ in 0..factor {
893    //     //     result.extend_from_slice(&self.coeffs);
894    //     //     // if coset_idx != 0 {
895    //     //     //     let start = coset_idx * current_size;
896    //     //     //     let end = start + current_size;
897    //     //     //     distribute_powers(&mut result[start..end], &worker, coset_generator);
898    //     //     //     // cooley_tukey_ntt::best_ct_ntt(&mut result[start..end], &worker, log_n, Some(log_num_cpus as usize), precomputed_omegas);
899    //     //     // }
900    //     //     // coset_generator.mul_assign(&coset_omega);
901    //     // }
902    //     // println!("Copying taken {:?}", start.elapsed());
903
904
905
906    //     // for coset_idx in 0..factor {
907    //     //     result.extend_from_slice(&self.coeffs);
908    //     //     if coset_idx != 0 {
909    //     //         let start = coset_idx * current_size;
910    //     //         let end = start + current_size;
911    //     //         distribute_powers(&mut result[start..end], &worker, coset_generator);
912    //     //     }
913    //     //     coset_generator.mul_assign(&coset_omega);
914    //     // }
915
916    //     // for r in results.iter_mut().skip(1) {
917    //     //     let mut coset_generator = coset_omega;
918    //     //     distribute_powers(&mut r[..], &worker, coset_generator);
919    //     //     coset_generator.mul_assign(&coset_omega);
920    //     // }
921
922    //     // let start = Instant::now();
923
924    //     let to_spawn = worker.cpus;
925
926    //     let chunk = Worker::chunk_size_for_num_spawned_threads(factor, to_spawn);
927
928    //     worker.scope(0, |scope, _| {
929    //         for thread_id in 0..to_spawn {
930    //             let r = unsafe {&mut *r};
931    //             scope.spawn(move |_| {
932    //                 let start = thread_id * chunk;
933    //                 let end = if start + chunk <= factor {
934    //                     start + chunk
935    //                 } else {
936    //                     factor
937    //                 };
938    //                 let mut gen_power = start;
939    //                 let mut coset_generator = coset_omega.pow(&[start as u64]);
940    //                 for i in start..end {
941    //                     let from = current_size * i;
942    //                     let to = from + current_size;
943    //                     if gen_power != 0 {
944    //                         distribute_powers_serial(&mut r[from..to], coset_generator);
945    //                     }
946    //                     cooley_tukey_ntt::best_ct_ntt(&mut r[from..to], &worker, log_n, num_cpus_hint, precomputed_omegas);
947    //                     coset_generator.mul_assign(&coset_omega);
948    //                     gen_power += 1;
949    //                 }
950    //             });
951    //         }
952    //     });
953
954    //     // println!("NTT taken {:?}", start.elapsed());
955
956    //     // let start = Instant::now();
957
958    //     // let mut final_values = vec![F::zero(); new_size];
959
960    //     // println!("Allocation of result taken {:?}", start.elapsed());
961
962    //     // let results_ref = &results;
963
964    //     // copy here is more complicated: to have the value in a natural order
965    //     // one has to use coset_idx to address the result element
966    //     // and use bit-reversed lookup for an element index
967
968    //     // let log_n = log_n as usize;
969
970    //     // let start = Instant::now();
971
972    //     // let total_len = result.len();
973
974    //     // let chunk = Worker::chunk_size_for_num_spawned_threads(total_len, to_spawn);
975
976    //     // let lower_bits_mask = (1 << log_n) - 1;
977
978    //     // let higher_bits_mask = !lower_bits_mask;
979
980
981    //     // worker.scope(0, |scope, _| {
982    //     //     for thread_id in 0..to_spawn {
983    //     //         let r = unsafe {&mut *r};
984    //     //         scope.spawn(move |_| {
985    //     //             let start = thread_id * chunk;
986    //     //             let end = if start + chunk <= total_len {
987    //     //                 start + chunk
988    //     //             } else {
989    //     //                 total_len
990    //     //             };
991    //     //             for j in start..end {
992    //     //                 let element_idx = j & lower_bits_mask;
993    //     //                 let coset_mask = j & higher_bits_mask;
994    //     //                 let rj = cooley_tukey_ntt::bitreverse(element_idx, log_n) | coset_mask;
995    //     //                 if j < rj {
996    //     //                     r.swap(j, rj);
997    //     //                 }  
998    //     //             }
999    //     //         });
1000    //     //     }
1001    //     // });
1002
1003    //     // println!("Final copying taken {:?}", start.elapsed());
1004
1005    //     Polynomial::from_values(result)
1006    // }
1007
1008    pub fn coset_filtering_lde(mut self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
1009        debug_assert!(self.coeffs.len().is_power_of_two());
1010        
1011        if factor == 1 {
1012            return Ok(self.coset_fft(&worker));
1013        }
1014        assert!(factor.is_power_of_two());
1015        self.distribute_powers(worker, F::multiplicative_generator());
1016
1017        let new_size = self.coeffs.len() * factor;
1018        let domain = Domain::new_for_size(new_size as u64)?;
1019
1020        let mut lde = self.coeffs;
1021        lde.resize(new_size as usize, F::zero());
1022        best_lde(&mut lde, worker, &domain.generator, domain.power_of_two as u32, factor);
1023
1024        Polynomial::from_values(lde)
1025    }
1026
1027    pub fn coset_lde_using_multiple_cosets_naive(self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
1028        debug_assert!(self.coeffs.len().is_power_of_two());
1029
1030        if factor == 1 {
1031            return Ok(self.coset_fft(&worker));
1032        }
1033        assert!(factor.is_power_of_two());
1034        let new_size = self.coeffs.len() * factor;
1035        let domain = Domain::new_for_size(new_size as u64)?;
1036
1037        let mut results = vec![];
1038
1039        let mut coset_generator = F::multiplicative_generator();
1040
1041        for _ in 0..factor {
1042            let coeffs = self.clone();
1043            let lde = coeffs.coset_fft_for_generator(&worker, coset_generator);
1044
1045            results.push(lde.into_coeffs());
1046            coset_generator.mul_assign(&domain.generator);
1047        }
1048        
1049        let mut final_values = vec![F::zero(); new_size];
1050
1051        let results_ref = &results;
1052
1053        worker.scope(final_values.len(), |scope, chunk| {
1054            for (i, v) in final_values.chunks_mut(chunk).enumerate() {
1055                scope.spawn(move |_| {
1056                    let mut idx = i*chunk;
1057                    for v in v.iter_mut() {
1058                        let coset_idx = idx % factor;
1059                        let element_idx = idx / factor; 
1060                        *v = results_ref[coset_idx][element_idx];
1061
1062                        idx += 1;
1063                    }
1064                });
1065            }
1066        });
1067
1068
1069        Polynomial::from_values(final_values)
1070    }
1071
1072    pub fn coset_lde_using_multiple_cosets(self, worker: &Worker, factor: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
1073        debug_assert!(self.coeffs.len().is_power_of_two());
1074        
1075        if factor == 1 {
1076            return Ok(self.coset_fft(&worker));
1077        }
1078
1079        let num_cpus = worker.cpus;
1080        let num_cpus_hint = if num_cpus <= factor {
1081            Some(1)
1082        } else {
1083            let threads_per_coset = factor / num_cpus;
1084            // let mut threads_per_coset = factor / num_cpus;
1085            // if factor % num_cpus != 0 {
1086            //     if (threads_per_coset + 1).is_power_of_two() {
1087            //         threads_per_coset += 1;
1088            //     }
1089            // }
1090            Some(threads_per_coset)
1091        };
1092
1093        assert!(factor.is_power_of_two());
1094        let new_size = self.coeffs.len() * factor;
1095        let domain = Domain::<F>::new_for_size(new_size as u64)?;
1096
1097        let mut results = vec![self.coeffs; factor];
1098
1099        let coset_omega = domain.generator;
1100        let this_domain_omega = self.omega;
1101
1102        let log_n = self.exp;
1103
1104        worker.scope(results.len(), |scope, chunk| {
1105            for (i, r) in results.chunks_mut(chunk).enumerate() {
1106                scope.spawn(move |_| {
1107                    let mut coset_generator = coset_omega.pow(&[i as u64]);
1108                    coset_generator.mul_assign(&F::multiplicative_generator());
1109                    for r in r.iter_mut() {
1110                        distribute_powers_serial(&mut r[..], coset_generator);
1111                        // distribute_powers(&mut r[..], &worker, coset_generator);
1112                        best_fft(&mut r[..], &worker, &this_domain_omega, log_n, num_cpus_hint);
1113                        coset_generator.mul_assign(&coset_omega);
1114                    }
1115                });
1116            }
1117        });
1118
1119        let mut final_values = vec![F::zero(); new_size];
1120
1121        let results_ref = &results;
1122
1123        worker.scope(final_values.len(), |scope, chunk| {
1124            for (i, v) in final_values.chunks_mut(chunk).enumerate() {
1125                scope.spawn(move |_| {
1126                    let mut idx = i*chunk;
1127                    for v in v.iter_mut() {
1128                        let coset_idx = idx % factor;
1129                        let element_idx = idx / factor; 
1130                        *v = results_ref[coset_idx][element_idx];
1131
1132                        idx += 1;
1133                    }
1134                });
1135            }
1136        });
1137
1138        Polynomial::from_values(final_values)
1139    }
1140
1141    pub fn fft(mut self, worker: &Worker) -> Polynomial<F, Values>
1142    {
1143        debug_assert!(self.coeffs.len().is_power_of_two());
1144        best_fft(&mut self.coeffs, worker, &self.omega, self.exp, None);
1145
1146        Polynomial::<F, Values> {
1147            coeffs: self.coeffs,
1148            exp: self.exp,
1149            omega: self.omega,
1150            omegainv: self.omegainv,
1151            geninv: self.geninv,
1152            minv: self.minv,
1153            _marker: std::marker::PhantomData
1154        }
1155    }
1156
1157    pub fn coset_fft(mut self, worker: &Worker) -> Polynomial<F, Values>
1158    {
1159        debug_assert!(self.coeffs.len().is_power_of_two());
1160        self.distribute_powers(worker, F::multiplicative_generator());
1161        self.fft(worker)
1162    }
1163
1164    pub fn coset_fft_for_generator(mut self, worker: &Worker, gen: F) -> Polynomial<F, Values>
1165    {
1166        debug_assert!(self.coeffs.len().is_power_of_two());
1167        self.distribute_powers(worker, gen);
1168        self.fft(worker)
1169    }
1170
1171    pub fn add_assign(&mut self, worker: &Worker, other: &Polynomial<F, Coefficients>) {
1172        assert!(self.coeffs.len() >= other.coeffs.len());
1173
1174        worker.scope(other.coeffs.len(), |scope, chunk| {
1175            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
1176                scope.spawn(move |_| {
1177                    for (a, b) in a.iter_mut().zip(b.iter()) {
1178                        a.add_assign(&b);
1179                    }
1180                });
1181            }
1182        });
1183    }
1184
1185    pub fn add_assign_scaled(&mut self, worker: &Worker, other: &Polynomial<F, Coefficients>, scaling: &F) {
1186        assert!(self.coeffs.len() >= other.coeffs.len());
1187
1188        worker.scope(other.coeffs.len(), |scope, chunk| {
1189            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
1190                scope.spawn(move |_| {
1191                    for (a, b) in a.iter_mut().zip(b.iter()) {
1192                        let mut tmp = *b;
1193                        tmp.mul_assign(&scaling);
1194                        a.add_assign(&tmp);
1195                    }
1196                });
1197            }
1198        });
1199    }
1200
1201    /// Perform O(n) subtraction of one polynomial from another in the domain.
1202    pub fn sub_assign(&mut self, worker: &Worker, other: &Polynomial<F, Coefficients>) {
1203        assert!(self.coeffs.len() >= other.coeffs.len());
1204
1205        worker.scope(other.coeffs.len(), |scope, chunk| {
1206            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
1207                scope.spawn(move |_| {
1208                    for (a, b) in a.iter_mut().zip(b.iter()) {
1209                        a.sub_assign(&b);
1210                    }
1211                });
1212            }
1213        });
1214    }
1215
1216    pub fn sub_assign_scaled(&mut self, worker: &Worker, other: &Polynomial<F, Coefficients>, scaling: &F) {
1217        assert!(self.coeffs.len() >= other.coeffs.len());
1218
1219        worker.scope(other.coeffs.len(), |scope, chunk| {
1220            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
1221                scope.spawn(move |_| {
1222                    for (a, b) in a.iter_mut().zip(b.iter()) {
1223                        let mut tmp = *b;
1224                        tmp.mul_assign(&scaling);
1225                        a.sub_assign(&tmp);
1226                    }
1227                });
1228            }
1229        });
1230    }
1231
1232    pub fn evaluate_at(&self, worker: &Worker, g: F) -> F {
1233        let num_threads = worker.get_num_spawned_threads(self.coeffs.len());
1234        let mut subvalues = vec![F::zero(); num_threads];
1235
1236        worker.scope(self.coeffs.len(), |scope, chunk| {
1237            for (i, (a, s)) in self.coeffs.chunks(chunk)
1238                        .zip(subvalues.chunks_mut(1))
1239                        .enumerate() {
1240                scope.spawn(move |_| {
1241                    let mut x = g.pow([(i*chunk) as u64]);
1242                    for a in a.iter() {
1243                        let mut value = x;
1244                        value.mul_assign(&a);
1245                        s[0].add_assign(&value);
1246                        x.mul_assign(&g);
1247                    }
1248                });
1249            }
1250        });
1251
1252        let mut result = F::zero();
1253        for v in subvalues.iter() {
1254            result.add_assign(&v);
1255        }
1256
1257        result
1258    }
1259}
1260
1261
1262impl<F: PrimeField> Polynomial<F, Values> {
1263    pub fn new_for_size(size: usize) -> Result<Polynomial<F, Values>, SynthesisError> {
1264        let coeffs = vec![F::zero(); size];
1265
1266        Self::from_values(coeffs)
1267    }
1268
1269    pub fn from_values(mut values: Vec<F>) -> Result<Polynomial<F, Values>, SynthesisError>
1270    {
1271        let coeffs_len = values.len();
1272
1273        let domain = Domain::new_for_size(coeffs_len as u64)?;
1274        let exp = domain.power_of_two as u32;
1275        let m = domain.size as usize;
1276        let omega = domain.generator;
1277
1278        values.resize(m, F::zero());
1279
1280        Ok(Polynomial::<F, Values> {
1281            coeffs: values,
1282            exp: exp,
1283            omega: omega,
1284            omegainv: omega.inverse().unwrap(),
1285            geninv: F::multiplicative_generator().inverse().unwrap(),
1286            minv: F::from_str(&format!("{}", m)).unwrap().inverse().unwrap(),
1287            _marker: std::marker::PhantomData
1288        })
1289    }
1290
1291    pub fn from_values_unpadded(values: Vec<F>) -> Result<Polynomial<F, Values>, SynthesisError>
1292    {
1293        let coeffs_len = values.len();
1294
1295        let domain = Domain::new_for_size(coeffs_len as u64)?;
1296        let exp = domain.power_of_two as u32;
1297        let m = domain.size as usize;
1298        let omega = domain.generator;
1299
1300        Ok(Polynomial::<F, Values> {
1301            coeffs: values,
1302            exp: exp,
1303            omega: omega,
1304            omegainv: omega.inverse().unwrap(),
1305            geninv: F::multiplicative_generator().inverse().unwrap(),
1306            minv: F::from_str(&format!("{}", m)).unwrap().inverse().unwrap(),
1307            _marker: std::marker::PhantomData
1308        })
1309    }
1310
1311    // this function should only be used on the values that are bitreverse enumerated
1312    #[track_caller]
1313    pub fn clone_subset_assuming_bitreversed(
1314        &self, 
1315        subset_factor: usize,
1316    ) -> Result<Polynomial<F, Values>, SynthesisError> {
1317        if subset_factor == 1 {
1318            return Ok(self.clone());
1319        }
1320
1321        assert!(subset_factor.is_power_of_two());
1322        
1323        let current_size = self.coeffs.len();
1324        let new_size = current_size / subset_factor;
1325
1326        let mut result = Vec::with_capacity(new_size);
1327        unsafe { result.set_len(new_size)};
1328
1329        // copy elements. If factor is 2 then non-reversed we would output only elements that are == 0 mod 2
1330        // If factor is 2 and we are bit-reversed - we need to only output first half of the coefficients
1331        // If factor is 4 then we need to output only the first 4th part
1332        // if factor is 8 - only the first 8th part
1333
1334        let start = 0;
1335        let end = new_size;
1336
1337        result.copy_from_slice(&self.coeffs[start..end]);
1338        
1339        // unsafe { result.set_len(new_size)};
1340        // let copy_to_start_pointer: *mut F = result[..].as_mut_ptr();
1341        // let copy_from_start_pointer: *const F = self.coeffs[start..end].as_ptr();
1342                        
1343        // unsafe { std::ptr::copy_nonoverlapping(copy_from_start_pointer, copy_to_start_pointer, new_size) };
1344
1345        Polynomial::from_values(result)
1346    }
1347
1348    #[track_caller]
1349    pub fn pow(&mut self, worker: &Worker, exp: u64)
1350    {
1351        if exp == 2 {
1352            return self.square(&worker);
1353        }
1354        worker.scope(self.coeffs.len(), |scope, chunk| {
1355            for v in self.coeffs.chunks_mut(chunk) {
1356                scope.spawn(move |_| {
1357                    for v in v.iter_mut() {
1358                        *v = v.pow([exp]);
1359                    }
1360                });
1361            }
1362        });
1363    }
1364
1365    #[track_caller]
1366    pub fn square(&mut self, worker: &Worker)
1367    {
1368        worker.scope(self.coeffs.len(), |scope, chunk| {
1369            for v in self.coeffs.chunks_mut(chunk) {
1370                scope.spawn(move |_| {
1371                    for v in v.iter_mut() {
1372                        v.square();
1373                    }
1374                });
1375            }
1376        });
1377    }
1378
1379    pub fn ifft(mut self, worker: &Worker) -> Polynomial<F, Coefficients>
1380    {
1381        debug_assert!(self.coeffs.len().is_power_of_two());
1382        best_fft(&mut self.coeffs, worker, &self.omegainv, self.exp, None);
1383
1384        worker.scope(self.coeffs.len(), |scope, chunk| {
1385            let minv = self.minv;
1386
1387            for v in self.coeffs.chunks_mut(chunk) {
1388                scope.spawn(move |_| {
1389                    for v in v {
1390                        v.mul_assign(&minv);
1391                    }
1392                });
1393            }
1394        });
1395
1396        Polynomial::<F, Coefficients> {
1397            coeffs: self.coeffs,
1398            exp: self.exp,
1399            omega: self.omega,
1400            omegainv: self.omegainv,
1401            geninv: self.geninv,
1402            minv: self.minv,
1403            _marker: std::marker::PhantomData
1404        }
1405    }
1406
1407    #[track_caller]
1408    pub fn icoset_fft(self, worker: &Worker) -> Polynomial<F, Coefficients>
1409    {
1410        debug_assert!(self.coeffs.len().is_power_of_two());
1411        let geninv = self.geninv;
1412        let mut res = self.ifft(worker);
1413        res.distribute_powers(worker, geninv);
1414
1415        res
1416    }
1417
1418    #[track_caller]
1419    pub fn icoset_fft_for_generator(self, worker: &Worker, coset_generator: &F) -> Polynomial<F, Coefficients>
1420    {
1421        debug_assert!(self.coeffs.len().is_power_of_two());
1422        let geninv = coset_generator.inverse().expect("must exist");
1423        let mut res = self.ifft(worker);
1424        res.distribute_powers(worker, geninv);
1425
1426        res
1427    }
1428
1429    #[track_caller]
1430    pub fn add_assign(&mut self, worker: &Worker, other: &Polynomial<F, Values>) {
1431        assert_eq!(self.coeffs.len(), other.coeffs.len());
1432
1433        worker.scope(self.coeffs.len(), |scope, chunk| {
1434            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
1435                scope.spawn(move |_| {
1436                    for (a, b) in a.iter_mut().zip(b.iter()) {
1437                        a.add_assign(&b);
1438                    }
1439                });
1440            }
1441        });
1442    }
1443
1444    #[track_caller]
1445    pub fn add_constant(&mut self, worker: &Worker, constant: &F) {
1446        worker.scope(self.coeffs.len(), |scope, chunk| {
1447            for a in self.coeffs.chunks_mut(chunk) {
1448                scope.spawn(move |_| {
1449                    for a in a.iter_mut() {
1450                        a.add_assign(&constant);
1451                    }
1452                });
1453            }
1454        });
1455    }
1456
1457    #[track_caller]
1458    pub fn sub_constant(&mut self, worker: &Worker, constant: &F) {
1459        worker.scope(self.coeffs.len(), |scope, chunk| {
1460            for a in self.coeffs.chunks_mut(chunk) {
1461                scope.spawn(move |_| {
1462                    for a in a.iter_mut() {
1463                        a.sub_assign(&constant);
1464                    }
1465                });
1466            }
1467        });
1468    }
1469
1470    #[track_caller]
1471    pub fn rotate(mut self, by: usize) -> Result<Polynomial<F, Values>, SynthesisError>{
1472        let mut values: Vec<_> = self.coeffs.drain(by..).collect();
1473
1474        for c in self.coeffs.into_iter() {
1475            values.push(c);
1476        }
1477
1478        Polynomial::from_values(values)
1479    }
1480
1481    #[track_caller]
1482    pub fn barycentric_evaluate_at(&self, worker: &Worker, g: F) -> Result<F, SynthesisError> {
1483        // use a barycentric formula
1484
1485        // L_i(X) = (omega^i / N) / (X - omega^i) * (X^N - 1)
1486        // we'll have to pay more for batch inversion at some point, but 
1487        // it's still useful
1488        let domain_size = self.size() as u64;
1489        assert!(domain_size.is_power_of_two());
1490
1491        let mut vanishing_at_g = g.pow(&[domain_size]);
1492        vanishing_at_g.sub_assign(&F::one());
1493
1494        // now generate (X - omega^i)
1495        let mut tmp = vec![g; domain_size as usize];
1496
1497        let generator = self.omega;
1498
1499        // constant factor = 1 / ( (1 / N) * (X^N - 1) ) = N / (X^N - 1)
1500
1501        worker.scope(tmp.len(), |scope, chunk| {
1502            for (i, vals) in tmp.chunks_mut(chunk)
1503                        .enumerate() {
1504                scope.spawn(move |_| {
1505                    // let mut one_over_omega_pow = generator_inv.pow([(i*chunk) as u64]);
1506                    // one_over_omega_pow.mul_assign(&constant_factor);
1507                    let mut omega_power = generator.pow([(i*chunk) as u64]);
1508                    for val in vals.iter_mut() {
1509                        val.sub_assign(&omega_power); // (X - omega^i)
1510                        // val.mul_assign(&one_over_omega_pow); // (X - omega^i) * N / (X^N - 1) * omega^(-i), so when we inverse it's valid evaluation
1511                        omega_power.mul_assign(&generator);
1512                        // one_over_omega_pow.mul_assign(&generator_inv);
1513                    }
1514                });
1515            }
1516        });
1517
1518        let mut values = Polynomial::from_values(tmp)?;
1519        values.batch_inversion(&worker)?;
1520
1521        // now multiply by omega^i / N * (X^N - 1) and value for L_i(X)
1522
1523        let mut constant_factor = vanishing_at_g;
1524        constant_factor.mul_assign(&self.minv);
1525
1526        worker.scope(values.size(), |scope, chunk| {
1527            for (i, (vals, coeffs)) in values.as_mut().chunks_mut(chunk)
1528                            .zip(self.coeffs.chunks(chunk))
1529                        .enumerate() {
1530                scope.spawn(move |_| {
1531                    let mut omega_power = generator.pow([(i*chunk) as u64]);
1532                    omega_power.mul_assign(&constant_factor);
1533                    for (val, coeff) in vals.iter_mut().zip(coeffs.iter()) {
1534                        val.mul_assign(&omega_power);
1535                        val.mul_assign(coeff);
1536                        omega_power.mul_assign(&generator);
1537                    }
1538                });
1539            }
1540        });
1541
1542        values.calculate_sum(&worker)
1543    }
1544
1545    #[track_caller]
1546    pub fn barycentric_over_coset_evaluate_at(&self, worker: &Worker, x: F, coset_factor: &F) -> Result<F, SynthesisError> {
1547        // use a barycentric formula
1548        // L_i(x) = \prod_{i != j} (X - x_j) / \prod_{i != j} (x_i - x_j)
1549        // that for a case when x_j = g*omega^j is simplified 
1550
1551        // \prod_{i != j} (X - x_j) = (X^N - g^N) / (X - g * omega^i) 
1552
1553        // \prod_{i != j} (x_i - x_j) = g * (omega)^i / N
1554
1555        // L_i(X) = (g*(omega)^i / N) / (X - g*(omega)^i) * (X^N - g^N)
1556        // we'll have to pay more for batch inversion at some point, but 
1557        // it's still useful
1558        let domain_size = self.size() as u64;
1559        assert!(domain_size.is_power_of_two());
1560
1561        // let normalization_factor = ..pow(&[domain_size]);
1562
1563        let offset = coset_factor.pow(&[domain_size]);
1564
1565        let normalization_factor = offset.inverse().ok_or(SynthesisError::DivisionByZero)?;
1566
1567        let mut vanishing_at_x = x.pow(&[domain_size]);
1568        vanishing_at_x.sub_assign(&offset);
1569
1570        // now generate (X - g*omega^i)
1571        let mut tmp = vec![x; domain_size as usize];
1572
1573        let generator = self.omega;
1574
1575        // constant factor = 1 / ( (1 / N) * (X^N - g^N) ) = N / (X^N - g^N)
1576
1577        worker.scope(tmp.len(), |scope, chunk| {
1578            for (i, vals) in tmp.chunks_mut(chunk)
1579                        .enumerate() {
1580                scope.spawn(move |_| {
1581                    let mut omega_power = generator.pow([(i*chunk) as u64]);
1582                    omega_power.mul_assign(&coset_factor);
1583                    for val in vals.iter_mut() {
1584                        val.sub_assign(&omega_power); // (X - omega^i)
1585                        omega_power.mul_assign(&generator);
1586                    }
1587                });
1588            }
1589        });
1590
1591        let mut values = Polynomial::from_values(tmp)?;
1592        values.batch_inversion(&worker)?;
1593
1594        // now multiply by g*omega^i / N * (X^N - g^N) and value for L_i(X)
1595
1596        let mut constant_factor = vanishing_at_x;
1597        constant_factor.mul_assign(&self.minv);
1598        constant_factor.mul_assign(&coset_factor);
1599        constant_factor.mul_assign(&normalization_factor);
1600
1601        worker.scope(values.size(), |scope, chunk| {
1602            for (i, (vals, coeffs)) in values.as_mut().chunks_mut(chunk)
1603                            .zip(self.coeffs.chunks(chunk))
1604                        .enumerate() {
1605                scope.spawn(move |_| {
1606                    let mut omega_power = generator.pow([(i*chunk) as u64]);
1607                    omega_power.mul_assign(&constant_factor);
1608                    for (val, coeff) in vals.iter_mut().zip(coeffs.iter()) {
1609                        val.mul_assign(&omega_power);
1610                        val.mul_assign(coeff);
1611                        omega_power.mul_assign(&generator);
1612                    }
1613                });
1614            }
1615        });
1616
1617        values.calculate_sum(&worker)
1618    }
1619
1620    // pub fn split_into_even_and_odd_assuming_natural_ordering(
1621    //     self, 
1622    //     worker: &Worker,
1623    // ) -> Result<(Polynomial::<F, Values>, Polynomial::<F, Values>), SynthesisError> {
1624    //     // Classical trick: f(x) = f_even(X^2) + x * f_odd(X^2)
1625    //     assert!(self.coeffs.len().is_power_of_two());
1626    //     assert!(self.coeffs.len() > 1);
1627
1628    //     let result_len = self.coeffs.len() / 2;
1629
1630    //     let mut coeffs = self.coeffs;
1631
1632    //     let mut second: Vec<_> = coeffs.drain(result_len..(2*result_len)).collect();
1633    //     let mut first = coeffs;
1634
1635    //     // f_even(X^2) = (f(x) + f(-x))/ 2
1636    //     // f_odd(X^2) = (f(x) - f(-x))/ 2x
1637
1638    //     worker.scope(first.len(), |scope, chunk| {
1639    //         for (i, (first, second)) in first.chunks_mut(chunk)
1640    //                     .zip(second.chunks_mut(chunk))
1641    //                     .enumerate() {
1642    //             scope.spawn(move |_| {
1643    //                 let mut divisor = generator_inv.pow([(i*chunk) as u64]);
1644    //                 divisor.mul_assign(&constant_factor);
1645    //                 for (f, s) in first.iter_mut().zip(second.iter_mut()) {
1646    //                     let f_at_x = *f;
1647    //                     let f_at_minus_x = *s;
1648
1649    //                     let mut even = f_at_x;
1650    //                     even.add_assign(&f_at_minus_x);
1651    //                     even.mul_assign(&two_inv);
1652
1653    //                     let mut odd = f_at_x;
1654    //                     odd.sub_assign(&f_at_minus_x);
1655    //                     odd.mul_assign(&divisor);
1656
1657    //                     *f = even;
1658    //                     *s = odd;
1659
1660    //                     divisor.mul_assign(&generator_inv);
1661    //                 }
1662    //             });
1663    //         }
1664    //     });
1665
1666    //     let even = Polynomial::from_values(first)?;
1667    //     let odd = Polynomial::from_values(second)?;
1668
1669    //     Ok((even, odd))
1670    // }
1671
1672    #[track_caller]
1673    pub fn split_into_even_and_odd_assuming_natural_ordering(
1674        self, 
1675        worker: &Worker, 
1676        coset_offset: &F
1677    ) -> Result<(Polynomial::<F, Values>, Polynomial::<F, Values>), SynthesisError> {
1678        // Classical trick: f(x) = f_even(X^2) + x * f_odd(X^2)
1679
1680        // f(g) = c_0 + c_1 * g + c_2 * g + c_3 * g
1681        // f(-g) = c_0 - c_1 * g + c_2 * g - c_3 * g
1682        // f_even(g) = c_0 + c_2 * g + ...
1683        // f_odd(g) = c_1 * g + c_3 * g + ...
1684
1685        // f(g*Omega) = c_0 + c_1 * g * Omega + c_2 * g * Omega^2 + c_3 * g * Omega^3
1686        // f(-g*Omega) = c_0 - c_1 * g * Omega + c_2 * g * Omega^2 - c_3 * g * Omega^3
1687
1688        // what should be
1689        // f_even(g*Omega^2) = c_0 + c_2 * g*Omega^2 + ...
1690        // f_odd(g*Omega^2/g) * g*Omega = c_1 * g * Omega + c_3 * g * Omega^3 + ...
1691
1692        // (f(g*Omega) + f(-g*Omega))/2 = c_0 + c_2 * g*Omega^2 + ... - those are values of the even coefficient polynomial at X^2/g
1693        // (f(g*Omega) - f(-g*Omega))/2 / (g * Omega) = c_1 + c_3 * Omega^2 + ... - those are values of the even coefficient polynomial at X^2/g^2
1694
1695
1696        // to make it homogenius (cause we don't care about particular coefficients)
1697        // we make it as 
1698        // (f(g*Omega) + f(-g*Omega))/2 / g = c_0/g + c_2 * Omega^2 - values for some polynomial over (X^2 / g^2)
1699        // (f(g*Omega) - f(-g*Omega))/2 / (g * Omega) = c_1 + c_3 * Omega^2 - values for some polynomial over (X^2 / g^2)
1700        assert!(self.coeffs.len().is_power_of_two());
1701        assert!(self.coeffs.len() > 1);
1702
1703        let result_len = self.coeffs.len() / 2;
1704
1705        let mut coeffs = self.coeffs;
1706
1707        let mut second: Vec<_> = coeffs.drain(result_len..(2*result_len)).collect();
1708        let mut first = coeffs;
1709
1710        let generator_inv = self.omegainv;
1711
1712        let mut two = F::one();
1713        two.double();
1714
1715        let coset_offset_inv = coset_offset.inverse().ok_or(SynthesisError::DivisionByZero)?;
1716
1717        let two_inv = two.inverse().ok_or(SynthesisError::DivisionByZero)?;
1718
1719        let mut constant_factor = two_inv;
1720        constant_factor.mul_assign(&coset_offset_inv);
1721
1722        let divisor_even = two_inv;
1723        // let divisor_even = constant_factor;
1724
1725        // f_even(X^2) = (f(x) + f(-x))/ 2
1726        // f_odd(X^2) = (f(x) - f(-x))/ 2x
1727
1728        worker.scope(first.len(), |scope, chunk| {
1729            for (i, (first, second)) in first.chunks_mut(chunk)
1730                        .zip(second.chunks_mut(chunk))
1731                        .enumerate() {
1732                scope.spawn(move |_| {
1733                    let mut divisor_odd = generator_inv.pow([(i*chunk) as u64]);
1734                    divisor_odd.mul_assign(&constant_factor);
1735                    for (f, s) in first.iter_mut().zip(second.iter_mut()) {
1736                        let f_at_x = *f;
1737                        let f_at_minus_x = *s;
1738
1739                        let mut even = f_at_x;
1740                        even.add_assign(&f_at_minus_x);
1741                        even.mul_assign(&divisor_even);
1742
1743                        let mut odd = f_at_x;
1744                        odd.sub_assign(&f_at_minus_x);
1745                        odd.mul_assign(&divisor_odd);
1746
1747                        *f = even;
1748                        *s = odd;
1749
1750                        divisor_odd.mul_assign(&generator_inv);
1751                    }
1752                });
1753            }
1754        });
1755
1756        let even = Polynomial::from_values(first)?;
1757        let odd = Polynomial::from_values(second)?;
1758
1759        Ok((even, odd))
1760    }
1761
1762    #[track_caller]
1763    pub fn calculate_shifted_grand_product(&self, worker: &Worker) -> Result<Polynomial<F, Values>, SynthesisError> {
1764        let mut result = vec![F::zero(); self.coeffs.len() + 1];
1765        result[0] = F::one();
1766
1767        // let not_shifted_product = self.calculate_grand_product(&worker)?;
1768        // result[1..].copy_from_slice(&not_shifted_product.into_coeffs()[..]);
1769
1770        // Polynomial::from_values_unpadded(result)
1771
1772        let work_chunk = &mut result[1..];
1773        assert!(work_chunk.len() == self.coeffs.len());
1774
1775        let num_threads = worker.get_num_spawned_threads(self.coeffs.len());
1776        let mut subproducts = vec![F::one(); num_threads as usize];
1777
1778        worker.scope(self.coeffs.len(), |scope, chunk| {
1779            for ((g, c), s) in work_chunk.chunks_mut(chunk)
1780                        .zip(self.coeffs.chunks(chunk))
1781                        .zip(subproducts.chunks_mut(1)) {
1782                scope.spawn(move |_| {
1783                    for (g, c) in g.iter_mut()
1784                                    .zip(c.iter()) {
1785                        s[0].mul_assign(&c);
1786                        *g = s[0];
1787                    }
1788                });
1789            }
1790        });
1791
1792        // subproducts are [abc, def, xyz]
1793
1794        // we do not need the last one
1795        subproducts.pop().expect("has at least one value");
1796
1797        let mut tmp = F::one();
1798        for s in subproducts.iter_mut() {
1799            tmp.mul_assign(&s);
1800            *s = tmp;
1801        }
1802
1803        let chunk_len = worker.get_chunk_size(self.coeffs.len());
1804
1805        worker.scope(0, |scope, _| {
1806            for (g, s) in work_chunk[chunk_len..].chunks_mut(chunk_len)
1807                        .zip(subproducts.chunks(1)) {
1808                scope.spawn(move |_| {
1809                    for g in g.iter_mut() {
1810                        g.mul_assign(&s[0]);
1811                    }
1812                });
1813            }
1814        });
1815
1816        Polynomial::from_values(result)
1817    }
1818
1819    #[track_caller]
1820    pub fn calculate_grand_product(&self, worker: &Worker) -> Result<Polynomial<F, Values>, SynthesisError> {
1821        let mut result = vec![F::zero(); self.coeffs.len()];
1822
1823        let num_threads = worker.get_num_spawned_threads(self.coeffs.len());
1824        let mut subproducts = vec![F::one(); num_threads as usize];
1825
1826        worker.scope(self.coeffs.len(), |scope, chunk| {
1827            for ((g, c), s) in result.chunks_mut(chunk)
1828                        .zip(self.coeffs.chunks(chunk))
1829                        .zip(subproducts.chunks_mut(1)) {
1830                scope.spawn(move |_| {
1831                    for (g, c) in g.iter_mut()
1832                                    .zip(c.iter()) {
1833                        s[0].mul_assign(&c);
1834                        *g = s[0];
1835                    }
1836                });
1837            }
1838        });
1839
1840        // subproducts are [abc, def, xyz]
1841
1842        // we do not need the last one
1843        subproducts.pop().expect("has at least one value");
1844
1845        let mut tmp = F::one();
1846        for s in subproducts.iter_mut() {
1847            tmp.mul_assign(&s);
1848            *s = tmp;
1849        }
1850
1851        let chunk_len = worker.get_chunk_size(self.coeffs.len());
1852
1853        worker.scope(0, |scope, _| {
1854            for (g, s) in result[chunk_len..].chunks_mut(chunk_len)
1855                        .zip(subproducts.chunks(1)) {
1856                scope.spawn(move |_| {
1857                    let c = s[0];
1858                    for g in g.iter_mut() {
1859                        g.mul_assign(&c);
1860                    }
1861                });
1862            }
1863        });
1864
1865        Polynomial::from_values_unpadded(result)
1866    }
1867
1868    #[track_caller]
1869    pub fn calculate_grand_product_serial(&self) -> Result<Polynomial<F, Values>, SynthesisError> {
1870        let mut result = Vec::with_capacity(self.coeffs.len());
1871
1872        let mut tmp = F::one();
1873        for c in self.coeffs.iter() {
1874            tmp.mul_assign(&c);
1875            result.push(tmp);
1876        }
1877
1878        Polynomial::from_values_unpadded(result)
1879    }
1880
1881    #[track_caller]
1882    pub fn calculate_sum(&self, worker: &Worker) -> Result<F, SynthesisError> {
1883        let num_threads = worker.get_num_spawned_threads(self.coeffs.len());
1884        let mut subresults = vec![F::zero(); num_threads as usize];
1885
1886        worker.scope(self.coeffs.len(), |scope, chunk| {
1887            for (c, s) in self.coeffs.chunks(chunk)
1888                        .zip(subresults.chunks_mut(1)) {
1889                scope.spawn(move |_| {
1890                    for c in c.iter() {
1891                        s[0].add_assign(&c);
1892                    }
1893                });
1894            }
1895        });
1896
1897        let mut sum = F::zero();
1898
1899        for el in subresults.iter() {
1900            sum.add_assign(&el);
1901        }
1902
1903        Ok(sum)
1904    }
1905
1906    #[track_caller]
1907    pub fn calculate_grand_sum(&self, worker: &Worker) -> Result<(F, Polynomial<F, Values>), SynthesisError> {
1908        // first value is zero, then first element, then first + second, ...
1909        let mut result = vec![F::zero(); self.coeffs.len() + 1];
1910
1911        let num_threads = worker.get_num_spawned_threads(self.coeffs.len());
1912        let mut subsums = vec![F::zero(); num_threads as usize];
1913
1914        worker.scope(self.coeffs.len(), |scope, chunk| {
1915            for ((g, c), s) in result[1..].chunks_mut(chunk)
1916                        .zip(self.coeffs.chunks(chunk))
1917                        .zip(subsums.chunks_mut(1)) {
1918                scope.spawn(move |_| {
1919                    for (g, c) in g.iter_mut()
1920                                    .zip(c.iter()) {
1921                        s[0].add_assign(&c);
1922                        *g = s[0];
1923                    }
1924                });
1925            }
1926        });
1927
1928        // subsums are [a+b+c, d+e+f, x+y+z]
1929
1930        let mut tmp = F::zero();
1931        for s in subsums.iter_mut() {
1932            tmp.add_assign(&s);
1933            *s = tmp;
1934        }
1935
1936        // sum over the full domain is the last element
1937        let domain_sum = subsums.pop().expect("has at least one value");
1938
1939        let chunk_len = worker.get_chunk_size(self.coeffs.len());
1940
1941        worker.scope(0, |scope, _| {
1942            for (g, s) in result[(chunk_len+1)..].chunks_mut(chunk_len)
1943                        .zip(subsums.chunks(1)) {
1944                scope.spawn(move |_| {
1945                    let c = s[0];
1946                    for g in g.iter_mut() {
1947                        g.add_assign(&c);
1948                    }
1949                });
1950            }
1951        });
1952
1953        // let result = result.drain(1..).collect();
1954
1955        let alt_total_sum = result.pop().expect("must pop the last element");
1956
1957        assert_eq!(alt_total_sum, domain_sum);
1958
1959        Ok((domain_sum, Polynomial::from_values_unpadded(result)?))
1960    }
1961
1962    #[track_caller]
1963    pub fn add_assign_scaled(&mut self, worker: &Worker, other: &Polynomial<F, Values>, scaling: &F) {
1964        assert_eq!(self.coeffs.len(), other.coeffs.len());
1965
1966        worker.scope(other.coeffs.len(), |scope, chunk| {
1967            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
1968                scope.spawn(move |_| {
1969                    for (a, b) in a.iter_mut().zip(b.iter()) {
1970                        let mut tmp = *b;
1971                        tmp.mul_assign(&scaling);
1972                        a.add_assign(&tmp);
1973                    }
1974                });
1975            }
1976        });
1977    }
1978
1979    /// Perform O(n) subtraction of one polynomial from another in the domain.
1980    #[track_caller]
1981    pub fn sub_assign(&mut self, worker: &Worker, other: &Polynomial<F, Values>) {
1982        assert_eq!(self.coeffs.len(), other.coeffs.len());
1983
1984        worker.scope(self.coeffs.len(), |scope, chunk| {
1985            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
1986                scope.spawn(move |_| {
1987                    for (a, b) in a.iter_mut().zip(b.iter()) {
1988                        a.sub_assign(&b);
1989                    }
1990                });
1991            }
1992        });
1993    }
1994
1995    #[track_caller]
1996    pub fn sub_assign_scaled(&mut self, worker: &Worker, other: &Polynomial<F, Values>, scaling: &F) {
1997        assert_eq!(self.coeffs.len(), other.coeffs.len());
1998
1999        worker.scope(other.coeffs.len(), |scope, chunk| {
2000            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
2001                scope.spawn(move |_| {
2002                    for (a, b) in a.iter_mut().zip(b.iter()) {
2003                        let mut tmp = *b;
2004                        tmp.mul_assign(&scaling);
2005                        a.sub_assign(&tmp);
2006                    }
2007                });
2008            }
2009        });
2010    }
2011
2012    /// Perform O(n) multiplication of two polynomials in the domain.
2013    #[track_caller]
2014    pub fn mul_assign(&mut self, worker: &Worker, other: &Polynomial<F, Values>) {
2015        assert_eq!(self.coeffs.len(), other.coeffs.len());
2016
2017        worker.scope(self.coeffs.len(), |scope, chunk| {
2018            for (a, b) in self.coeffs.chunks_mut(chunk).zip(other.coeffs.chunks(chunk)) {
2019                scope.spawn(move |_| {
2020                    for (a, b) in a.iter_mut().zip(b.iter()) {
2021                        a.mul_assign(&b);
2022                    }
2023                });
2024            }
2025        });
2026    }
2027
2028    pub fn batch_inversion(&mut self, worker: &Worker) -> Result<(), SynthesisError> {
2029        let num_threads = worker.get_num_spawned_threads(self.coeffs.len());
2030
2031        let mut grand_products = vec![F::one(); self.coeffs.len()];
2032        let mut subproducts = vec![F::one(); num_threads as usize];
2033
2034        worker.scope(self.coeffs.len(), |scope, chunk| {
2035            for ((a, g), s) in self.coeffs.chunks(chunk)
2036                        .zip(grand_products.chunks_mut(chunk))
2037                        .zip(subproducts.chunks_mut(1)) {
2038                scope.spawn(move |_| {
2039                    for (a, g) in a.iter().zip(g.iter_mut()) {
2040                        s[0].mul_assign(&a);
2041                        *g = s[0];
2042                    }
2043                });
2044            }
2045        });
2046
2047        // now coeffs are [a, b, c, d, ..., z]
2048        // grand_products are [a, ab, abc, d, de, def, ...., xyz]
2049        // subproducts are [abc, def, xyz]
2050        // not guaranteed to have equal length
2051
2052        let mut full_grand_product = F::one();
2053        for sub in subproducts.iter() {
2054            full_grand_product.mul_assign(sub);
2055        }
2056
2057        let product_inverse = full_grand_product.inverse().ok_or(SynthesisError::DivisionByZero)?;
2058
2059        // now let's get [abc^-1, def^-1, ..., xyz^-1];
2060        let mut subinverses = vec![F::one(); num_threads];
2061        for (i, s) in subinverses.iter_mut().enumerate() {
2062            let mut tmp = product_inverse;
2063            for (j, p) in subproducts.iter().enumerate() {
2064                if i == j {
2065                    continue;
2066                }
2067                tmp.mul_assign(&p);
2068            }
2069
2070            *s = tmp;
2071        }
2072
2073        worker.scope(self.coeffs.len(), |scope, chunk| {
2074            for ((a, g), s) in self.coeffs.chunks_mut(chunk)
2075                        .zip(grand_products.chunks(chunk))
2076                        .zip(subinverses.chunks_mut(1)) {
2077                scope.spawn(move |_| {
2078                    for (a, g) in a.iter_mut().rev()
2079                                .zip(g.iter().rev().skip(1).chain(Some(F::one()).iter())) {
2080                        // s[0] = abc^-1
2081                        // a = c
2082                        // g = ab
2083                        let tmp = *a; // c
2084                        *a = *g;
2085                        a.mul_assign(&s[0]); // a = ab*(abc^-1) = c^-1
2086                        s[0].mul_assign(&tmp); // s[0] = (ab)^-1
2087                    }
2088                });
2089            }
2090        });
2091
2092        Ok(())
2093    }
2094
2095    #[track_caller]
2096    pub fn pop_last(&mut self) -> Result<F, SynthesisError> {
2097        let last = self.coeffs.pop().ok_or(SynthesisError::AssignmentMissing)?;
2098
2099        Ok(last)
2100    }
2101
2102    #[track_caller]
2103    pub fn clone_shifted_assuming_natural_ordering(&self, by: usize) -> Result<Self, SynthesisError> {
2104        let len = self.coeffs.len();
2105        assert!(by < len);
2106        let mut new_coeffs = vec![F::zero(); self.coeffs.len()];
2107        new_coeffs[..(len - by)].copy_from_slice(&self.coeffs[by..]);
2108        new_coeffs[(len-by)..].copy_from_slice(&self.coeffs[..by]);
2109
2110        Self::from_values(new_coeffs)
2111    }
2112
2113    #[track_caller]
2114    pub fn clone_shifted_assuming_bitreversed(&self, by: usize, worker: &Worker) -> Result<Self, SynthesisError> {
2115        let len = self.coeffs.len();
2116        assert!(by < len);
2117        let mut extended_clone = Vec::with_capacity(len + by);
2118        extended_clone.extend_from_slice(&self.coeffs);
2119        let mut tmp = Self::from_values(extended_clone)?;
2120        tmp.bitreverse_enumeration(&worker);
2121
2122        let mut coeffs = tmp.into_coeffs();
2123        let tmp: Vec<_> = coeffs.drain(..by).collect();
2124        coeffs.extend(tmp);
2125
2126        let mut tmp = Self::from_values(coeffs)?;
2127        tmp.bitreverse_enumeration(&worker);
2128
2129        Ok(tmp)
2130    }
2131}
2132
2133impl<F: PartialTwoBitReductionField> Polynomial<F, Coefficients> {
2134    pub fn bitreversed_lde_using_bitreversed_ntt_with_partial_reduction<P: CTPrecomputations<F>>(
2135        self, 
2136        worker: &Worker, 
2137        factor: usize,
2138        precomputed_omegas: &P,
2139        coset_factor: &F
2140    ) -> Result<Polynomial<F, Values>, SynthesisError> {
2141        debug_assert!(self.coeffs.len().is_power_of_two());
2142        debug_assert_eq!(self.size(), precomputed_omegas.domain_size());
2143        
2144        if factor == 1 {
2145            return Ok(self.fft(&worker));
2146        }
2147
2148        let num_cpus = worker.cpus;
2149        let num_cpus_hint = if num_cpus <= factor {
2150            Some(1)
2151        } else {
2152            let mut threads_per_coset = num_cpus / factor;
2153            if threads_per_coset == 0 {
2154                threads_per_coset = 1;
2155            } else if num_cpus % factor != 0 {
2156                threads_per_coset += 1;
2157            }
2158            // let mut threads_per_coset = factor / num_cpus;
2159            // if factor % num_cpus != 0 {
2160            //     if (threads_per_coset + 1).is_power_of_two() {
2161            //         threads_per_coset += 1;
2162            //     }
2163            // }
2164            Some(threads_per_coset)
2165        };
2166
2167        assert!(factor.is_power_of_two());
2168        let current_size = self.coeffs.len();
2169        let new_size = self.coeffs.len() * factor;
2170        let domain = Domain::<F>::new_for_size(new_size as u64)?;
2171
2172        // let mut results = vec![self.coeffs.clone(); factor];
2173
2174        let mut result = Vec::with_capacity(new_size);
2175        unsafe { result.set_len(new_size)};
2176
2177        let r = &mut result[..] as *mut [F];
2178
2179        let coset_omega = domain.generator;
2180
2181        let log_n = self.exp;
2182
2183        let range: Vec<usize> = (0..factor).collect();
2184
2185        let self_coeffs_ref = &self.coeffs;
2186
2187        // copy
2188
2189        worker.scope(range.len(), |scope, chunk| {
2190            for coset_idx in range.chunks(chunk) {
2191                let r = unsafe {&mut *r};
2192                scope.spawn(move |_| {
2193                    for coset_idx in coset_idx.iter() {
2194                        let start = current_size * coset_idx;
2195                        let end = start + current_size;
2196                        let copy_start_pointer: *mut F = r[start..end].as_mut_ptr();
2197                        
2198                        unsafe { std::ptr::copy_nonoverlapping(self_coeffs_ref.as_ptr(), copy_start_pointer, current_size) };
2199                    }
2200                });
2201            }
2202        });
2203
2204        let to_spawn = factor;
2205        let coset_size = current_size;
2206
2207        use crate::plonk::commitments::transparent::utils::log2_floor;
2208
2209        let factor_log = log2_floor(factor) as usize;
2210
2211        // let chunk = Worker::chunk_size_for_num_spawned_threads(factor, to_spawn);
2212
2213        // Each coset will produce values at specific indexes only, e.g
2214        // coset factor of omega^0 = 1 will produce elements that are only at places == 0 mod 16
2215        // coset factor of omega^1 will produce elements that are only at places == 1 mod 16
2216        // etc. We expect the output to be bitreversed, so 
2217        // elements for coset factor of omega^0 = 1 will need to be placed first (00 top bits, bitreversed 00)
2218        // elements for coset factor of omega^1 will need to be placed after the first half (10 top bits, bitreversed 01)
2219
2220        worker.scope(0, |scope, _| {
2221            for coset_idx in 0..to_spawn {
2222                let r = unsafe {&mut *r};
2223                scope.spawn(move |_| {
2224                    let from = coset_size * coset_idx;
2225                    let to = from + coset_size;
2226                    let one = F::one();
2227                    let bitreversed_power = cooley_tukey_ntt::bitreverse(coset_idx, factor_log); 
2228                    let mut coset_generator = coset_omega.pow(&[bitreversed_power as u64]);
2229                    coset_generator.mul_assign(&coset_factor);
2230                    if coset_generator != one {
2231                        distribute_powers_with_num_cpus(&mut r[from..to], &worker, coset_generator, num_cpus_hint.expect("is some"));
2232                    }
2233                    partial_reduction::best_ct_ntt_partial_reduction(&mut r[from..to], &worker, log_n, num_cpus_hint, precomputed_omegas);
2234                });
2235            }
2236        });
2237
2238        Polynomial::from_values(result)
2239    }
2240}
2241
2242impl<F: PartialTwoBitReductionField> Polynomial<F, Values> {
2243    pub fn ifft_using_bitreversed_ntt_with_partial_reduction<P: CTPrecomputations<F>>(
2244        self, 
2245        worker: &Worker, 
2246        precomputed_omegas: &P,
2247        coset_generator: &F
2248    ) -> Result<Polynomial<F, Coefficients>, SynthesisError> {
2249        if self.coeffs.len() <= worker.cpus * 4 {
2250            return Ok(self.ifft(&worker));
2251        }
2252
2253        let mut coeffs: Vec<_> = self.coeffs;
2254        let exp = self.exp;
2255        cooley_tukey_ntt::partial_reduction::best_ct_ntt_partial_reduction(&mut coeffs, worker, exp, Some(worker.cpus), precomputed_omegas);
2256
2257        let mut this = Polynomial::from_coeffs(coeffs)?;
2258        
2259        this.bitreverse_enumeration(&worker);
2260
2261        let geninv = coset_generator.inverse().expect("must exist");
2262
2263        worker.scope(this.coeffs.len(), |scope, chunk| {
2264            let minv = this.minv;
2265
2266            for v in this.coeffs.chunks_mut(chunk) {
2267                scope.spawn(move |_| {
2268                    for v in v {
2269                        v.mul_assign(&minv);
2270                    }
2271                });
2272            }
2273        });
2274
2275        if geninv != F::one() {
2276            this.distribute_powers(&worker, geninv);
2277        }
2278
2279        Ok(this)
2280    }
2281}
2282
2283impl<F: PrimeField> Polynomial<F, Values> {
2284    /// taken in natural enumeration
2285    /// outputs in natural enumeration
2286    pub fn ifft_using_bitreversed_ntt<P: CTPrecomputations<F>>(
2287        self, 
2288        worker: &Worker, 
2289        precomputed_omegas: &P,
2290        coset_generator: &F
2291    ) -> Result<Polynomial<F, Coefficients>, SynthesisError> {
2292        if self.coeffs.len() <= worker.cpus * 4 {
2293            return Ok(self.ifft(&worker));
2294        }
2295
2296        let mut coeffs: Vec<_> = self.coeffs;
2297        let exp = self.exp;
2298        cooley_tukey_ntt::best_ct_ntt(&mut coeffs, worker, exp, Some(worker.cpus), precomputed_omegas);
2299        let mut this = Polynomial::from_coeffs(coeffs)?;
2300        
2301        this.bitreverse_enumeration(&worker);
2302
2303        let geninv = coset_generator.inverse().expect("must exist");
2304
2305        worker.scope(this.coeffs.len(), |scope, chunk| {
2306            let minv = this.minv;
2307
2308            for v in this.coeffs.chunks_mut(chunk) {
2309                scope.spawn(move |_| {
2310                    for v in v {
2311                        v.mul_assign(&minv);
2312                    }
2313                });
2314            }
2315        });
2316
2317        if geninv != F::one() {
2318            this.distribute_powers(&worker, geninv);
2319        }
2320
2321        Ok(this)
2322    }
2323}
2324
2325impl<F: PrimeField> Polynomial<F, Coefficients> {
2326    pub fn bitreversed_lde_using_bitreversed_ntt<P: CTPrecomputations<F>>(
2327        self, 
2328        worker: &Worker, 
2329        factor: usize,
2330        precomputed_omegas: &P,
2331        coset_factor: &F
2332    ) -> Result<Polynomial<F, Values>, SynthesisError> {
2333        debug_assert!(self.coeffs.len().is_power_of_two());
2334        debug_assert_eq!(self.size(), precomputed_omegas.domain_size());
2335        
2336        if factor == 1 {
2337            return Ok(self.fft(&worker));
2338        }
2339
2340        let num_cpus = worker.cpus;
2341        let num_cpus_hint = if num_cpus <= factor {
2342            Some(1)
2343        } else {
2344            let mut threads_per_coset = num_cpus / factor;
2345            if threads_per_coset == 0 {
2346                threads_per_coset = 1;
2347            } else if num_cpus % factor != 0 {
2348                threads_per_coset += 1;
2349            }
2350            // let mut threads_per_coset = factor / num_cpus;
2351            // if factor % num_cpus != 0 {
2352            //     if (threads_per_coset + 1).is_power_of_two() {
2353            //         threads_per_coset += 1;
2354            //     }
2355            // }
2356            Some(threads_per_coset)
2357        };
2358
2359        assert!(factor.is_power_of_two());
2360        let current_size = self.coeffs.len();
2361        let new_size = self.coeffs.len() * factor;
2362        let domain = Domain::<F>::new_for_size(new_size as u64)?;
2363
2364        // let mut results = vec![self.coeffs.clone(); factor];
2365
2366        let mut result = Vec::with_capacity(new_size);
2367        unsafe { result.set_len(new_size)};
2368
2369        let r = &mut result[..] as *mut [F];
2370
2371        let coset_omega = domain.generator;
2372
2373        let log_n = self.exp;
2374
2375        let range: Vec<usize> = (0..factor).collect();
2376
2377        let self_coeffs_ref = &self.coeffs;
2378
2379        // copy
2380
2381        worker.scope(range.len(), |scope, chunk| {
2382            for coset_idx in range.chunks(chunk) {
2383                let r = unsafe {&mut *r};
2384                scope.spawn(move |_| {
2385                    for coset_idx in coset_idx.iter() {
2386                        let start = current_size * coset_idx;
2387                        let end = start + current_size;
2388                        let copy_start_pointer: *mut F = r[start..end].as_mut_ptr();
2389                        
2390                        unsafe { std::ptr::copy_nonoverlapping(self_coeffs_ref.as_ptr(), copy_start_pointer, current_size) };
2391                    }
2392                });
2393            }
2394        });
2395
2396        let to_spawn = factor;
2397        let coset_size = current_size;
2398
2399        use crate::plonk::commitments::transparent::utils::log2_floor;
2400
2401        let factor_log = log2_floor(factor) as usize;
2402
2403        // let chunk = Worker::chunk_size_for_num_spawned_threads(factor, to_spawn);
2404
2405        // Each coset will produce values at specific indexes only, e.g
2406        // coset factor of omega^0 = 1 will produce elements that are only at places == 0 mod 16
2407        // coset factor of omega^1 will produce elements that are only at places == 1 mod 16
2408        // etc. We expect the output to be bitreversed, so 
2409        // elements for coset factor of omega^0 = 1 will need to be placed first (00 top bits, bitreversed 00)
2410        // elements for coset factor of omega^1 will need to be placed after the first half (10 top bits, bitreversed 01)
2411
2412        worker.scope(0, |scope, _| {
2413            for coset_idx in 0..to_spawn {
2414                let r = unsafe {&mut *r};
2415                scope.spawn(move |_| {
2416                    let from = coset_size * coset_idx;
2417                    let to = from + coset_size;
2418                    let one = F::one();
2419                    let bitreversed_power = cooley_tukey_ntt::bitreverse(coset_idx, factor_log); 
2420                    let mut coset_generator = coset_omega.pow(&[bitreversed_power as u64]);
2421                    coset_generator.mul_assign(&coset_factor);
2422                    if coset_generator != one {
2423                        distribute_powers_with_num_cpus(&mut r[from..to], &worker, coset_generator, num_cpus_hint.expect("is some"));
2424                    }
2425                    cooley_tukey_ntt::best_ct_ntt(&mut r[from..to], &worker, log_n, num_cpus_hint, precomputed_omegas);
2426                });
2427            }
2428        });
2429
2430        Polynomial::from_values(result)
2431    }
2432
2433    /// taken in natural enumeration
2434    /// outputs in natural enumeration
2435    pub fn fft_using_bitreversed_ntt<P: CTPrecomputations<F>>(
2436        self, 
2437        worker: &Worker, 
2438        precomputed_omegas: &P,
2439        coset_generator: &F
2440    ) -> Result<Polynomial<F, Values>, SynthesisError> {
2441        if self.coeffs.len() <= worker.cpus * 4 {
2442            return Ok(self.coset_fft_for_generator(&worker, *coset_generator));
2443        }
2444
2445        let mut this = self;
2446        if coset_generator != &F::one() {
2447            this.distribute_powers(&worker, *coset_generator);
2448        }
2449
2450        let mut coeffs: Vec<_> = this.coeffs;
2451        let exp = this.exp;
2452        cooley_tukey_ntt::best_ct_ntt(&mut coeffs, worker, exp, Some(worker.cpus), precomputed_omegas);
2453        let mut this = Polynomial::from_values(coeffs)?;
2454        
2455        this.bitreverse_enumeration(&worker);
2456
2457        Ok(this)
2458    }
2459
2460    /// taken in natural enumeration
2461    /// outputs in natural enumeration
2462    pub fn fft_using_bitreversed_ntt_output_bitreversed<P: CTPrecomputations<F>>(
2463        self, 
2464        worker: &Worker, 
2465        precomputed_omegas: &P,
2466        coset_generator: &F
2467    ) -> Result<Polynomial<F, Values>, SynthesisError> {
2468        if self.coeffs.len() <= worker.cpus * 4 {
2469            return Ok(self.coset_fft_for_generator(&worker, *coset_generator));
2470        }
2471
2472        let mut this = self;
2473        if coset_generator != &F::one() {
2474            this.distribute_powers(&worker, *coset_generator);
2475        }
2476
2477        let mut coeffs: Vec<_> = this.coeffs;
2478        let exp = this.exp;
2479        cooley_tukey_ntt::best_ct_ntt(&mut coeffs, worker, exp, Some(worker.cpus), precomputed_omegas);
2480        let this = Polynomial::from_values(coeffs)?;
2481    
2482        Ok(this)
2483    }
2484}
2485
2486#[cfg(test)]
2487mod test {
2488    use core::num;
2489
2490
2491    #[test]
2492    fn test_shifted_grand_product() {
2493        use crate::pairing::bn256::Fr;
2494        use crate::ff::{Field, PrimeField};
2495        use super::*;
2496
2497        use rand::{XorShiftRng, SeedableRng, Rand, Rng};
2498        use crate::worker::Worker;
2499    
2500        let samples: usize = 1 << 20;
2501        let rng = &mut XorShiftRng::from_seed([0x3dbe6259, 0x8d313d76, 0x3237db17, 0xe5bc0654]);
2502    
2503        let v = (0..samples).map(|_| Fr::rand(rng)).collect::<Vec<_>>();
2504
2505        let mut manual = vec![];
2506        manual.push(Fr::one());
2507
2508        let mut tmp = Fr::one();
2509
2510        for v in v.iter() {
2511            tmp.mul_assign(&v);
2512            manual.push(tmp);
2513        }
2514
2515        let as_poly = Polynomial::from_values(v).unwrap();
2516        let worker = Worker::new();
2517        let as_poly = as_poly.calculate_shifted_grand_product(&worker).unwrap();
2518        let as_poly = as_poly.into_coeffs();
2519        for idx in 0..manual.len() {
2520            assert_eq!(manual[idx], as_poly[idx], "failed at idx = {}", idx);
2521        }
2522    }
2523
2524    #[test]
2525    fn test_grand_product() {
2526        use crate::pairing::bn256::Fr;
2527        use crate::ff::{Field, PrimeField};
2528        use super::*;
2529
2530        use rand::{XorShiftRng, SeedableRng, Rand, Rng};
2531        use crate::worker::Worker;
2532    
2533        let samples: usize = 1 << 20;
2534        let rng = &mut XorShiftRng::from_seed([0x3dbe6259, 0x8d313d76, 0x3237db17, 0xe5bc0654]);
2535    
2536        let v = (0..samples).map(|_| Fr::rand(rng)).collect::<Vec<_>>();
2537
2538        let mut manual = vec![];
2539
2540        let mut tmp = Fr::one();
2541
2542        for v in v.iter() {
2543            tmp.mul_assign(&v);
2544            manual.push(tmp);
2545        }
2546
2547        let as_poly = Polynomial::from_values(v).unwrap();
2548        let worker = Worker::new();
2549        let as_poly = as_poly.calculate_grand_product(&worker).unwrap();
2550        let as_poly = as_poly.into_coeffs();
2551        for idx in 0..manual.len() {
2552            assert_eq!(manual[idx], as_poly[idx], "failed at idx = {}", idx);
2553        }
2554    }
2555
2556    #[test]
2557    #[ignore]
2558    fn test_insane_size_lde() {
2559        use crate::pairing::bn256::Fr;
2560        use crate::ff::{Field, PrimeField};
2561        use super::*;
2562
2563        use rand::{XorShiftRng, SeedableRng, Rand, Rng};
2564        use crate::worker::Worker;
2565
2566        let max_size = 1 << 26;
2567        let worker = Worker::new();
2568
2569        assert!(worker.cpus >= 16, "should be tested only on large machines");
2570    
2571        let scalars = crate::kate_commitment::test::make_random_field_elements::<Fr>(&worker, max_size);
2572
2573        for size in vec![1 << 23, 1 << 24, 1 << 25, 1 << 26] {
2574            let poly = Polynomial::from_coeffs(scalars[..size].to_vec()).unwrap();
2575            use crate::plonk::fft::cooley_tukey_ntt::*;
2576
2577            let omegas_bitreversed = BitReversedOmegas::<Fr>::new_for_domain_size(size.next_power_of_two());
2578            for cpus in vec![4, 8, 16, 32] {
2579            // for cpus in vec![16, 24, 32] {
2580
2581                use std::time::Instant;
2582
2583                let subworker = Worker::new_with_cpus(cpus);
2584
2585                let poly = poly.clone();
2586
2587                let now = Instant::now();
2588
2589                let _ = poly.bitreversed_lde_using_bitreversed_ntt(
2590                    &subworker,
2591                    4,
2592                    &omegas_bitreversed,
2593                    &Fr::multiplicative_generator()
2594                ).unwrap();
2595
2596                println!("Total LDE time for {} points on {} cpus = {:?}", size, cpus, now.elapsed());
2597            }
2598        }
2599    }
2600
2601    #[test]
2602    #[ignore]
2603    fn test_fft_scaling() {
2604        use crate::pairing::bn256::Fr;
2605        use crate::ff::{Field, PrimeField};
2606        use super::*;
2607
2608        use rand::{XorShiftRng, SeedableRng, Rand, Rng};
2609        use crate::worker::Worker;
2610
2611        let max_size = 1 << 26;
2612        let worker = Worker::new();
2613
2614        assert!(worker.cpus >= 16, "should be tested only on large machines");
2615    
2616        let scalars = crate::kate_commitment::test::make_random_field_elements::<Fr>(&worker, max_size);
2617
2618        for size in vec![1 << 23, 1 << 24, 1 << 25, 1 << 26] {
2619            let poly = Polynomial::from_coeffs(scalars[..size].to_vec()).unwrap();
2620            use crate::plonk::fft::cooley_tukey_ntt::*;
2621
2622            let omegas_bitreversed = BitReversedOmegas::<Fr>::new_for_domain_size(size.next_power_of_two());
2623            for cpus in vec![4, 8, 16, 32] {
2624            // for cpus in vec![16, 24, 32] {
2625
2626                use std::time::Instant;
2627
2628                let subworker = Worker::new_with_cpus(cpus);
2629
2630                let poly = poly.clone();
2631
2632                let now = Instant::now();
2633
2634                let _ = poly.fft_using_bitreversed_ntt_output_bitreversed(
2635                    &subworker,
2636                    &omegas_bitreversed,
2637                    &Fr::one()
2638                    // &Fr::multiplicative_generator()
2639                ).unwrap();
2640
2641                println!("Total FFT time time for {} points on {} cpus = {:?}", size, cpus, now.elapsed());
2642            }
2643        }
2644    }
2645
2646    #[test]
2647    fn test_lde_explicit_multicore_validity() {
2648        use crate::pairing::bn256::Fr;
2649        use crate::ff::{Field, PrimeField};
2650        use super::*;
2651
2652        if cfg!(debug_assertions) {
2653            println!("Will be too slow to run in test mode, abort");
2654            return;
2655        }
2656
2657        use rand::{XorShiftRng, SeedableRng, Rand, Rng};
2658        use crate::worker::Worker;
2659
2660        let size = 1 << 21;
2661        let worker = Worker::new();
2662    
2663        let scalars = crate::kate_commitment::test::make_random_field_elements::<Fr>(&worker, size);
2664        let poly = Polynomial::from_coeffs(scalars).unwrap();
2665        let omegas_bitreversed = BitReversedOmegas::<Fr>::new_for_domain_size(size.next_power_of_two());
2666        use crate::plonk::fft::cooley_tukey_ntt::*;
2667
2668        let mut reference: Option<Polynomial<_, _>> = None;
2669
2670        for _ in 0..100 {
2671            for num_cpus in 1..=32 {
2672                let subworker = Worker::new_with_cpus(num_cpus);
2673                let poly = poly.clone();
2674
2675                // let candidate = poly.fft_using_bitreversed_ntt_output_bitreversed(
2676                //     &subworker,
2677                //     &omegas_bitreversed,
2678                //     // &Fr::one()
2679                //     &Fr::multiplicative_generator()
2680                // ).unwrap();
2681
2682                let candidate = poly.bitreversed_lde_using_bitreversed_ntt(
2683                    &subworker, 
2684                    4, 
2685                    &omegas_bitreversed, 
2686                    &Fr::multiplicative_generator()
2687                ).unwrap();
2688
2689                if let Some(to_compare) = reference.take() {
2690                    assert_eq!(candidate.as_ref(), to_compare.as_ref(), "mismatch for {} cpus", num_cpus);
2691                } else {
2692                    reference = Some(candidate);
2693                }
2694
2695                println!("Completed for {} cpus", num_cpus);
2696            }
2697        }
2698    }
2699}