// tfhe_ntt/prime64.rs

1use crate::{bit_rev, fastdiv::Div64, prime::is_prime64, roots::find_primitive_root64};
2use aligned_vec::{avec, ABox};
3
4#[allow(unused_imports)]
5use pulp::*;
6
// Problem size above which the NTT kernels switch strategy (presumably to a
// recursive decomposition — used by the submodules; confirm there).
const RECURSION_THRESHOLD: usize = 1024;
// The Goldilocks/Solinas prime 2^64 - 2^32 + 1; evaluated in u128 so the
// intermediate 2^64 does not overflow before the final cast back to u64.
pub(crate) const SOLINAS_PRIME: u64 = ((1_u128 << 64) - (1_u128 << 32) + 1) as u64;
9
10mod generic_solinas;
11mod shoup;
12
13#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
14#[cfg(feature = "nightly")]
15mod less_than_50bit;
16#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
17#[cfg(feature = "nightly")]
18mod less_than_51bit;
19
20mod less_than_62bit;
21mod less_than_63bit;
22
23use self::generic_solinas::PrimeModulus;
24use crate::roots::find_root_solinas_64;
25pub use generic_solinas::Solinas;
26
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
impl crate::V3 {
    /// Interleaves the 128-bit halves of two vectors:
    /// for inputs `a = [a0, a1, a2, a3]`, `b = [b0, b1, b2, b3]` returns
    /// `[[a0, a1, b0, b1], [a2, a3, b2, b3]]` (per `vperm2f128` imm 0x20/0x31).
    #[inline(always)]
    fn interleave2_u64x4(self, z0z0z1z1: [u64x4; 2]) -> [u64x4; 2] {
        let avx = self.avx;
        [
            cast(
                avx._mm256_permute2f128_si256::<0b0010_0000>(cast(z0z0z1z1[0]), cast(z0z0z1z1[1])),
            ),
            cast(
                avx._mm256_permute2f128_si256::<0b0011_0001>(cast(z0z0z1z1[0]), cast(z0z0z1z1[1])),
            ),
        ]
    }

    /// Builds `[w0, w0, w1, w1]`: broadcasts `w[0]` into the low 128-bit lane
    /// and `w[1]` into the high 128-bit lane.
    #[inline(always)]
    fn permute2_u64x4(self, w: [u64; 2]) -> u64x4 {
        let avx = self.avx;
        let w00 = self.sse2._mm_set1_epi64x(w[0] as _);
        let w11 = self.sse2._mm_set1_epi64x(w[1] as _);
        cast(avx._mm256_insertf128_si256::<0b1>(avx._mm256_castsi128_si256(w00), w11))
    }

    /// Interleaves elements pairwise within each 128-bit lane:
    /// for `a = [a0, a1, a2, a3]`, `b = [b0, b1, b2, b3]` returns
    /// `[[a0, b0, a2, b2], [a1, b1, a3, b3]]` (unpacklo/unpackhi semantics).
    #[inline(always)]
    fn interleave1_u64x4(self, z0z1: [u64x4; 2]) -> [u64x4; 2] {
        let avx = self.avx2;
        [
            cast(avx._mm256_unpacklo_epi64(cast(z0z1[0]), cast(z0z1[1]))),
            cast(avx._mm256_unpackhi_epi64(cast(z0z1[0]), cast(z0z1[1]))),
        ]
    }

    /// Shuffles `[w0, w1, w2, w3]` into `[w0, w2, w1, w3]` using two lane
    /// duplications followed by a `shuffle_pd` blend (imm 0b1100).
    #[inline(always)]
    fn permute1_u64x4(self, w: [u64; 4]) -> u64x4 {
        let avx = self.avx;
        let w0123 = pulp::cast(w);
        // w0101 = [w0, w1, w0, w1], w2323 = [w2, w3, w2, w3]
        let w0101 = avx._mm256_permute2f128_si256::<0b0000_0000>(w0123, w0123);
        let w2323 = avx._mm256_permute2f128_si256::<0b0011_0011>(w0123, w0123);
        cast(avx._mm256_castpd_si256(avx._mm256_shuffle_pd::<0b1100>(
            avx._mm256_castsi256_pd(w0101),
            avx._mm256_castsi256_pd(w2323),
        )))
    }

    /// Lane-wise conditional subtraction: returns `x` where `x < modulus`,
    /// otherwise `x - modulus`. Fully reduces values in `[0, 2*modulus)`.
    #[inline(always)]
    pub fn small_mod_u64x4(self, modulus: u64x4, x: u64x4) -> u64x4 {
        self.select_u64x4(
            self.cmp_gt_u64x4(modulus, x),
            x,
            self.wrapping_sub_u64x4(x, modulus),
        )
    }
}
80
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
impl crate::V4 {
    /// Interleaves the 256-bit halves of two vectors:
    /// for `a = [a0..a7]`, `b = [b0..b7]` returns
    /// `[[a0..a3, b0..b3], [a4..a7, b4..b7]]`.
    #[inline(always)]
    fn interleave4_u64x8(self, z0z0z0z0z1z1z1z1: [u64x8; 2]) -> [u64x8; 2] {
        let avx = self.avx512f;
        // permutex2var index >= 8 selects from the second source operand.
        let idx_0 = avx._mm512_setr_epi64(0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb);
        let idx_1 = avx._mm512_setr_epi64(0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf);
        [
            cast(avx._mm512_permutex2var_epi64(
                cast(z0z0z0z0z1z1z1z1[0]),
                idx_0,
                cast(z0z0z0z0z1z1z1z1[1]),
            )),
            cast(avx._mm512_permutex2var_epi64(
                cast(z0z0z0z0z1z1z1z1[0]),
                idx_1,
                cast(z0z0z0z0z1z1z1z1[1]),
            )),
        ]
    }

    /// Builds `[w0, w0, w0, w0, w1, w1, w1, w1]` from a pair of scalars.
    #[inline(always)]
    fn permute4_u64x8(self, w: [u64; 2]) -> u64x8 {
        let avx = self.avx512f;
        let w = pulp::cast(w);
        // Only lanes 0 and 1 of the widened vector are meaningful; the
        // permute below reads nothing else.
        let w01xxxxxx = avx._mm512_castsi128_si512(w);
        let idx = avx._mm512_setr_epi64(0, 0, 0, 0, 1, 1, 1, 1);
        cast(avx._mm512_permutexvar_epi64(idx, w01xxxxxx))
    }

    /// Interleaves 128-bit sub-lanes: for `a = [a0..a7]`, `b = [b0..b7]`
    /// returns `[[a0, a1, b0, b1, a4, a5, b4, b5], [a2, a3, b2, b3, a6, a7, b6, b7]]`.
    #[inline(always)]
    fn interleave2_u64x8(self, z0z0z1z1: [u64x8; 2]) -> [u64x8; 2] {
        let avx = self.avx512f;
        let idx_0 = avx._mm512_setr_epi64(0x0, 0x1, 0x8, 0x9, 0x4, 0x5, 0xc, 0xd);
        let idx_1 = avx._mm512_setr_epi64(0x2, 0x3, 0xa, 0xb, 0x6, 0x7, 0xe, 0xf);
        [
            cast(avx._mm512_permutex2var_epi64(cast(z0z0z1z1[0]), idx_0, cast(z0z0z1z1[1]))),
            cast(avx._mm512_permutex2var_epi64(cast(z0z0z1z1[0]), idx_1, cast(z0z0z1z1[1]))),
        ]
    }

    /// Builds `[w0, w0, w2, w2, w1, w1, w3, w3]` from four scalars.
    #[inline(always)]
    fn permute2_u64x8(self, w: [u64; 4]) -> u64x8 {
        let avx = self.avx512f;
        let w = pulp::cast(w);
        let w0123xxxx = avx._mm512_castsi256_si512(w);
        let idx = avx._mm512_setr_epi64(0, 0, 2, 2, 1, 1, 3, 3);
        cast(avx._mm512_permutexvar_epi64(idx, w0123xxxx))
    }

    /// Interleaves elements pairwise within each 128-bit sub-lane
    /// (unpacklo/unpackhi on 64-bit elements).
    #[inline(always)]
    fn interleave1_u64x8(self, z0z1: [u64x8; 2]) -> [u64x8; 2] {
        let avx = self.avx512f;
        [
            cast(avx._mm512_unpacklo_epi64(cast(z0z1[0]), cast(z0z1[1]))),
            cast(avx._mm512_unpackhi_epi64(cast(z0z1[0]), cast(z0z1[1]))),
        ]
    }

    /// Shuffles `[w0..w7]` into `[w0, w4, w1, w5, w2, w6, w3, w7]`.
    #[inline(always)]
    fn permute1_u64x8(self, w: [u64; 8]) -> u64x8 {
        let avx = self.avx512f;
        let w = pulp::cast(w);
        let idx = avx._mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7);
        cast(avx._mm512_permutexvar_epi64(idx, w))
    }

    /// Lane-wise conditional subtraction: returns `x` where `x < modulus`,
    /// otherwise `x - modulus`. Fully reduces values in `[0, 2*modulus)`.
    #[inline(always)]
    pub fn small_mod_u64x8(self, modulus: u64x8, x: u64x8) -> u64x8 {
        self.select_u64x8(
            self.cmp_gt_u64x8(modulus, x),
            x,
            self.wrapping_sub_u64x8(x, modulus),
        )
    }
}
158
159fn init_negacyclic_twiddles(p: u64, n: usize, twid: &mut [u64], inv_twid: &mut [u64]) {
160    let div = Div64::new(p);
161
162    let w = if p == SOLINAS_PRIME {
163        // Used custom root-of-unity with Goldilocks prime
164        // Those root-of-unity enable generation of friendly twiddle will low hamming weight
165        // and enable replacement of multiplication with simple shift
166        match n {
167            32 => 8_u64,
168            64 => 2198989700608_u64,
169            128 => 14041890976876060974_u64,
170            256 => 14430643036723656017_u64,
171            512 => 4440654710286119610_u64,
172            1024 => 8816101479115663336_u64,
173            2048 => 10974926054405199669_u64,
174            4096 => 1206500561358145487_u64,
175            8192 => 10930245224889659871_u64,
176            16384 => 3333600369887534767_u64,
177            32768 => 15893793146607301539_u64,
178            _ => find_root_solinas_64(div, 2 * n as u64).unwrap(),
179        }
180    } else {
181        find_primitive_root64(div, 2 * n as u64).unwrap()
182    };
183
184    let mut k = 0;
185    let mut wk = 1u64;
186
187    let nbits = n.trailing_zeros();
188    while k < n {
189        let fwd_idx = bit_rev(nbits, k);
190
191        twid[fwd_idx] = wk;
192
193        let inv_idx = bit_rev(nbits, (n - k) % n);
194        if k == 0 {
195            inv_twid[inv_idx] = wk;
196        } else {
197            let x = p.wrapping_sub(wk);
198            inv_twid[inv_idx] = x;
199        }
200
201        wk = Div64::rem_u128(wk as u128 * w as u128, div);
202        k += 1;
203    }
204}
205
206fn init_negacyclic_twiddles_shoup(
207    p: u64,
208    n: usize,
209    max_bits: u32,
210    twid: &mut [u64],
211    twid_shoup: &mut [u64],
212    inv_twid: &mut [u64],
213    inv_twid_shoup: &mut [u64],
214) {
215    let div = Div64::new(p);
216    let w = find_primitive_root64(div, 2 * n as u64).unwrap();
217    let mut k = 0;
218    let mut wk = 1u64;
219
220    let nbits = n.trailing_zeros();
221    while k < n {
222        let fwd_idx = bit_rev(nbits, k);
223
224        let wk_shoup = Div64::div_u128((wk as u128) << max_bits, div) as u64;
225        twid[fwd_idx] = wk;
226        twid_shoup[fwd_idx] = wk_shoup;
227
228        let inv_idx = bit_rev(nbits, (n - k) % n);
229        if k == 0 {
230            inv_twid[inv_idx] = wk;
231            inv_twid_shoup[inv_idx] = wk_shoup;
232        } else {
233            let x = p.wrapping_sub(wk);
234            inv_twid[inv_idx] = x;
235            inv_twid_shoup[inv_idx] = Div64::div_u128((x as u128) << max_bits, div) as u64;
236        }
237
238        wk = Div64::rem_u128(wk as u128 * w as u128, div);
239        k += 1;
240    }
241}
242
/// Negacyclic NTT plan for 64bit primes.
#[derive(Clone)]
pub struct Plan {
    // Forward twiddle factors, stored in bit-reversed order.
    twid: ABox<[u64]>,
    // Shoup companions of `twid`; left empty when the modulus is >= 2^63
    // (see `try_new`).
    twid_shoup: ABox<[u64]>,
    // Inverse twiddle factors, bit-reversed order.
    inv_twid: ABox<[u64]>,
    // Shoup companions of `inv_twid`; empty when the modulus is >= 2^63.
    inv_twid_shoup: ABox<[u64]>,
    // The prime modulus.
    p: u64,
    // Precomputed division data for `p`.
    p_div: Div64,

    // used for elementwise product
    // Barrett constant: floor(2^(big_q + bits - 1) / p), see `try_new`.
    p_barrett: u64,
    // Bit size of the modulus: ilog2(p) + 1.
    big_q: u64,

    // Inverse of the NTT size mod p (computed by Fermat exponentiation in
    // `try_new`), with its Shoup companion for fast normalization.
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
}
260
// Manual Debug impl: only the externally meaningful parameters (size and
// modulus) are shown; the twiddle tables are omitted.
impl core::fmt::Debug for Plan {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("Plan")
            .field("ntt_size", &self.ntt_size())
            .field("modulus", &self.modulus())
            .finish()
    }
}
269
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
/// Computes `lhs[i] <- lhs[i] * rhs[i] * n_inv_mod_p mod p` using AVX-512
/// IFMA 52-bit lanes: a Barrett reduction of the product followed by a Shoup
/// multiplication by `n_inv_mod_p` for the normalization step.
///
/// Processes the slices in chunks of 8; any remainder beyond a multiple of 8
/// is silently skipped (callers presumably guarantee len % 8 == 0 — the NTT
/// size is at least 16).
fn mul_assign_normalize_ifma(
    simd: crate::V4IFma,
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let lhs = pulp::as_arrays_mut::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            // Shift amounts for extracting the Barrett approximation from the
            // 104-bit (2x52) product split across `lo`/`hi`.
            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(52 - (big_q - 1));
            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let p_barrett = simd.splat_u64x8(p_barrett);
            let neg_p = simd.splat_u64x8(p.wrapping_neg());
            let p = simd.splat_u64x8(p);
            let zero = simd.splat_u64x8(0);

            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = cast(*lhs_);
                let rhs = cast(*rhs);

                // lhs × rhs
                let (lo, hi) = simd.widening_mul_u52x8(lhs, rhs);
                // c1 = (lhs*rhs) >> (big_q - 1), assembled from the two limbs.
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                // Barrett quotient estimate.
                let c3 = simd.widening_mul_u52x8(c1, p_barrett).1;
                // lo - p * c3
                let prod = simd.wrapping_mul_add_u52x8(neg_p, c3, lo);

                // normalization
                // Shoup multiplication: t = prod*n_inv - shoup_q*p (mod 2^52 lanes).
                let shoup_q = simd.widening_mul_u52x8(prod, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_mul_add_u52x8(
                    shoup_q,
                    neg_p,
                    simd.wrapping_mul_add_u52x8(prod, n_inv_mod_p, zero),
                );

                // Final conditional subtraction reduces t from [0, 2p) to [0, p).
                *lhs_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    )
}
324
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
/// Computes `acc[i] <- (acc[i] + lhs[i] * rhs[i]) mod p` using AVX-512 IFMA
/// 52-bit lanes with Barrett reduction of the product.
///
/// Processes the slices in chunks of 8; any remainder beyond a multiple of 8
/// is silently skipped (callers presumably guarantee len % 8 == 0).
fn mul_accumulate_ifma(
    simd: crate::V4IFma,
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let acc = pulp::as_arrays_mut::<8, _>(acc).0;
            let lhs = pulp::as_arrays::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            // Shift amounts for extracting the Barrett approximation from the
            // 104-bit (2x52) product split across `lo`/`hi`.
            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(52 - (big_q - 1));
            let p_barrett = simd.splat_u64x8(p_barrett);
            let neg_p = simd.splat_u64x8(p.wrapping_neg());
            let p = simd.splat_u64x8(p);

            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let lhs = cast(*lhs);
                let rhs = cast(*rhs);

                // lhs × rhs
                let (lo, hi) = simd.widening_mul_u52x8(lhs, rhs);
                // c1 = (lhs*rhs) >> (big_q - 1)
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u52x8(c1, p_barrett).1;
                // lo - p * c3
                let prod = simd.wrapping_mul_add_u52x8(neg_p, c3, lo);
                // Reduce the Barrett result from [0, 2p) to [0, p) before adding.
                let prod = simd.small_mod_u64x8(p, prod);

                *acc = cast(simd.small_mod_u64x8(p, simd.wrapping_add_u64x8(prod, cast(*acc))));
            }
        },
    )
}
369
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
/// Computes `lhs[i] <- lhs[i] * rhs[i] * n_inv_mod_p mod p` with full 64-bit
/// AVX-512 lanes: Barrett reduction of the 128-bit product, then a Shoup
/// multiplication by `n_inv_mod_p` for the normalization step.
///
/// Processes the slices in chunks of 8; any remainder beyond a multiple of 8
/// is silently skipped (callers presumably guarantee len % 8 == 0).
fn mul_assign_normalize_avx512(
    simd: crate::V4,
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let lhs = pulp::as_arrays_mut::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            // Shift amounts to extract (lhs*rhs) >> (big_q - 1) from the
            // 128-bit product split across `lo`/`hi`.
            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(64 - (big_q - 1));
            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let p_barrett = simd.splat_u64x8(p_barrett);
            let p = simd.splat_u64x8(p);

            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = cast(*lhs_);
                let rhs = cast(*rhs);

                // lhs × rhs
                let (lo, hi) = simd.widening_mul_u64x8(lhs, rhs);
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                // Barrett quotient estimate; prod = lo - p*c3 lies in [0, 2p).
                let c3 = simd.widening_mul_u64x8(c1, p_barrett).1;
                let prod = simd.wrapping_sub_u64x8(lo, simd.wrapping_mul_u64x8(p, c3));

                // normalization
                // Shoup multiplication: t = prod*n_inv - shoup_q*p (mod 2^64).
                let shoup_q = simd.widening_mul_u64x8(prod, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x8(
                    simd.wrapping_mul_u64x8(prod, n_inv_mod_p),
                    simd.wrapping_mul_u64x8(shoup_q, p),
                );

                // Final conditional subtraction reduces t from [0, 2p) to [0, p).
                *lhs_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    );
}
420
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
/// Computes `acc[i] <- (acc[i] + lhs[i] * rhs[i]) mod p` with full 64-bit
/// AVX-512 lanes using Barrett reduction of the 128-bit product.
///
/// Processes the slices in chunks of 8; any remainder beyond a multiple of 8
/// is silently skipped (callers presumably guarantee len % 8 == 0).
fn mul_accumulate_avx512(
    simd: crate::V4,
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let acc = pulp::as_arrays_mut::<8, _>(acc).0;
            let lhs = pulp::as_arrays::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            // Shift amounts to extract (lhs*rhs) >> (big_q - 1) from `lo`/`hi`.
            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(64 - (big_q - 1));
            let p_barrett = simd.splat_u64x8(p_barrett);
            let p = simd.splat_u64x8(p);

            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let lhs = cast(*lhs);
                let rhs = cast(*rhs);

                // lhs × rhs
                let (lo, hi) = simd.widening_mul_u64x8(lhs, rhs);
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u64x8(c1, p_barrett).1;
                // lo - p * c3
                let prod = simd.wrapping_sub_u64x8(lo, simd.wrapping_mul_u64x8(p, c3));
                // Reduce the Barrett result from [0, 2p) to [0, p) before adding.
                let prod = simd.small_mod_u64x8(p, prod);

                *acc = cast(simd.small_mod_u64x8(p, simd.wrapping_add_u64x8(prod, cast(*acc))));
            }
        },
    )
}
464
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Computes `lhs[i] <- lhs[i] * rhs[i] * n_inv_mod_p mod p` with AVX2 64-bit
/// lanes: Barrett reduction of the 128-bit product, then a Shoup
/// multiplication by `n_inv_mod_p` for the normalization step.
///
/// Processes the slices in chunks of 4; any remainder beyond a multiple of 4
/// is silently skipped (callers presumably guarantee len % 4 == 0).
fn mul_assign_normalize_avx2(
    simd: crate::V3,
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let lhs = pulp::as_arrays_mut::<4, _>(lhs).0;
            let rhs = pulp::as_arrays::<4, _>(rhs).0;
            // Shift amounts to extract (lhs*rhs) >> (big_q - 1) from `lo`/`hi`.
            let big_q_m1 = simd.splat_u64x4(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x4(64 - (big_q - 1));
            let n_inv_mod_p = simd.splat_u64x4(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x4(n_inv_mod_p_shoup);
            let p_barrett = simd.splat_u64x4(p_barrett);
            let p = simd.splat_u64x4(p);

            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = cast(*lhs_);
                let rhs = cast(*rhs);

                // lhs × rhs
                let (lo, hi) = simd.widening_mul_u64x4(lhs, rhs);
                let c1 = simd.or_u64x4(
                    simd.shr_dyn_u64x4(lo, big_q_m1),
                    simd.shl_dyn_u64x4(hi, big_q_m1_complement),
                );
                // Barrett quotient estimate; `.0` of the widening mul is the
                // low 64 bits, i.e. a wrapping multiply.
                let c3 = simd.widening_mul_u64x4(c1, p_barrett).1;
                let prod = simd.wrapping_sub_u64x4(lo, simd.widening_mul_u64x4(p, c3).0);

                // normalization
                // Shoup multiplication: t = prod*n_inv - shoup_q*p (mod 2^64).
                let shoup_q = simd.widening_mul_u64x4(prod, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x4(
                    simd.widening_mul_u64x4(prod, n_inv_mod_p).0,
                    simd.widening_mul_u64x4(shoup_q, p).0,
                );

                // Final conditional subtraction reduces t from [0, 2p) to [0, p).
                *lhs_ = cast(simd.small_mod_u64x4(p, t));
            }
        },
    );
}
513
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Computes `acc[i] <- (acc[i] + lhs[i] * rhs[i]) mod p` with AVX2 64-bit
/// lanes using Barrett reduction of the 128-bit product.
///
/// Processes the slices in chunks of 4; any remainder beyond a multiple of 4
/// is silently skipped (callers presumably guarantee len % 4 == 0).
fn mul_accumulate_avx2(
    simd: crate::V3,
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let acc = pulp::as_arrays_mut::<4, _>(acc).0;
            let lhs = pulp::as_arrays::<4, _>(lhs).0;
            let rhs = pulp::as_arrays::<4, _>(rhs).0;

            // Shift amounts to extract (lhs*rhs) >> (big_q - 1) from `lo`/`hi`.
            let big_q_m1 = simd.splat_u64x4(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x4(64 - (big_q - 1));
            let p_barrett = simd.splat_u64x4(p_barrett);
            let p = simd.splat_u64x4(p);

            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let lhs = cast(*lhs);
                let rhs = cast(*rhs);

                // lhs × rhs
                let (lo, hi) = simd.widening_mul_u64x4(lhs, rhs);
                let c1 = simd.or_u64x4(
                    simd.shr_dyn_u64x4(lo, big_q_m1),
                    simd.shl_dyn_u64x4(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u64x4(c1, p_barrett).1;
                // lo - p * c3
                let prod = simd.wrapping_sub_u64x4(lo, simd.widening_mul_u64x4(p, c3).0);
                // Reduce the Barrett result from [0, 2p) to [0, p) before adding.
                let prod = simd.small_mod_u64x4(p, prod);

                *acc = cast(simd.small_mod_u64x4(p, simd.wrapping_add_u64x4(prod, cast(*acc))));
            }
        },
    )
}
556
/// Scalar fallback: computes `lhs[i] <- lhs[i] * rhs[i] * n_inv_mod_p mod p`
/// via a Barrett reduction of the 128-bit product followed by a Shoup
/// multiplication by `n_inv_mod_p`.
fn mul_assign_normalize_scalar(
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    let shift = big_q - 1;

    for (l, &r) in lhs.iter_mut().zip(rhs.iter()) {
        // Barrett reduction of the full product: `reduced ≡ l*r (mod p)`,
        // with `reduced` in [0, 2p).
        let wide = *l as u128 * r as u128;
        let approx = (wide >> shift) as u64;
        let quotient = ((approx as u128 * p_barrett as u128) >> 64) as u64;
        let reduced = (wide as u64).wrapping_sub(p.wrapping_mul(quotient));

        // Shoup multiplication by n_inv_mod_p, then a conditional subtraction
        // (the `min` trick) to land in [0, p).
        let q_hat = ((reduced as u128 * n_inv_mod_p_shoup as u128) >> 64) as u64;
        let t = reduced
            .wrapping_mul(n_inv_mod_p)
            .wrapping_sub(q_hat.wrapping_mul(p));
        *l = t.min(t.wrapping_sub(p));
    }
}
583
/// Scalar fallback: computes `acc[i] <- (acc[i] + lhs[i] * rhs[i]) mod p`
/// using a Barrett reduction of the 128-bit product.
fn mul_accumulate_scalar(
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    let shift = big_q - 1;

    for ((a, &l), &r) in acc.iter_mut().zip(lhs.iter()).zip(rhs.iter()) {
        // Barrett reduction: `reduced ≡ l*r (mod p)` with `reduced` in [0, 2p),
        // then a conditional subtraction (the `min` trick) down to [0, p).
        let wide = l as u128 * r as u128;
        let approx = (wide >> shift) as u64;
        let quotient = ((approx as u128 * p_barrett as u128) >> 64) as u64;
        let reduced = (wide as u64).wrapping_sub(p.wrapping_mul(quotient));
        let reduced = reduced.min(reduced.wrapping_sub(p));

        // Modular accumulation: both operands are < p, so the sum is < 2p.
        let sum = reduced + *a;
        *a = sum.min(sum.wrapping_sub(p));
    }
}
608
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
/// Multiplies every value by `n_inv_mod_p` modulo `p` (Shoup multiplication)
/// using AVX-512 IFMA 52-bit lanes.
///
/// Processes the slice in chunks of 8; any remainder beyond a multiple of 8
/// is silently skipped (callers presumably guarantee len % 8 == 0).
fn normalize_ifma(
    simd: crate::V4IFma,
    values: &mut [u64],
    p: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let values = pulp::as_arrays_mut::<8, _>(values).0;

            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let neg_p = simd.splat_u64x8(p.wrapping_neg());
            let p = simd.splat_u64x8(p);
            let zero = simd.splat_u64x8(0);

            for val_ in values {
                let val = cast(*val_);

                // normalization
                // Shoup multiplication: t = val*n_inv - shoup_q*p, built from
                // fused 52-bit multiply-adds.
                let shoup_q = simd.widening_mul_u52x8(val, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_mul_add_u52x8(
                    shoup_q,
                    neg_p,
                    simd.wrapping_mul_add_u52x8(val, n_inv_mod_p, zero),
                );

                // Final conditional subtraction reduces t from [0, 2p) to [0, p).
                *val_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    )
}
645
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
/// Multiplies every value by `n_inv_mod_p` modulo `p` (Shoup multiplication)
/// using full 64-bit AVX-512 lanes.
///
/// Processes the slice in chunks of 8; any remainder beyond a multiple of 8
/// is silently skipped (callers presumably guarantee len % 8 == 0).
fn normalize_avx512(
    simd: crate::V4,
    values: &mut [u64],
    p: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let values = pulp::as_arrays_mut::<8, _>(values).0;

            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p)
;
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let p = simd.splat_u64x8(p);

            for val_ in values {
                let val = cast(*val_);

                // normalization
                // Shoup multiplication: t = val*n_inv - shoup_q*p (mod 2^64).
                let shoup_q = simd.widening_mul_u64x8(val, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x8(
                    simd.wrapping_mul_u64x8(val, n_inv_mod_p),
                    simd.wrapping_mul_u64x8(shoup_q, p),
                );

                // Final conditional subtraction reduces t from [0, 2p) to [0, p).
                *val_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    );
}
679
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// Multiplies every value by `n_inv_mod_p` modulo `p` (Shoup multiplication)
/// using AVX2 64-bit lanes.
///
/// Processes the slice in chunks of 4; any remainder beyond a multiple of 4
/// is silently skipped (callers presumably guarantee len % 4 == 0).
fn normalize_avx2(
    simd: crate::V3,
    values: &mut [u64],
    p: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let values = pulp::as_arrays_mut::<4, _>(values).0;

            let n_inv_mod_p = simd.splat_u64x4(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x4(n_inv_mod_p_shoup);
            let p = simd.splat_u64x4(p);

            for val_ in values {
                let val = cast(*val_);

                // normalization
                // Shoup multiplication: t = val*n_inv - shoup_q*p (mod 2^64);
                // `.0` of the widening mul is the low 64 bits (wrapping mul).
                let shoup_q = simd.widening_mul_u64x4(val, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x4(
                    simd.widening_mul_u64x4(val, n_inv_mod_p).0,
                    simd.widening_mul_u64x4(shoup_q, p).0,
                );

                // Final conditional subtraction reduces t from [0, 2p) to [0, p).
                *val_ = cast(simd.small_mod_u64x4(p, t));
            }
        },
    );
}
712
/// Scalar fallback: multiplies every value by `n_inv_mod_p` modulo `p` using
/// Shoup's precomputed-quotient multiplication.
fn normalize_scalar(values: &mut [u64], p: u64, n_inv_mod_p: u64, n_inv_mod_p_shoup: u64) {
    for v in values.iter_mut() {
        // q_hat approximates (v * n_inv_mod_p) / p from the precomputed
        // quotient; t = v*n_inv - q_hat*p lands in [0, 2p).
        let q_hat = ((*v as u128 * n_inv_mod_p_shoup as u128) >> 64) as u64;
        let t = v
            .wrapping_mul(n_inv_mod_p)
            .wrapping_sub(q_hat.wrapping_mul(p));

        // Conditional subtraction (the `min` trick) reduces t to [0, p).
        *v = t.min(t.wrapping_sub(p));
    }
}
723
724impl Plan {
    /// Returns a negacyclic NTT plan for the given polynomial size and modulus, or `None` if no
    /// suitable roots of unity can be found for the wanted parameters.
    pub fn try_new(polynomial_size: usize, modulus: u64) -> Option<Self> {
        let p_div = Div64::new(modulus);
        // 16 = 8x2 = max_register_size * ntt_radix,
        // as SIMD registers can contain at most 8*u64
        // and the implementation assumes that SIMD registers are full
        if polynomial_size < 16
            || !polynomial_size.is_power_of_two()
            || !is_prime64(modulus)
            || find_primitive_root64(p_div, 2 * polynomial_size as u64).is_none()
        {
            None
        } else {
            // AVX-512 IFMA works on 52-bit lanes, so the Shoup constants are
            // scaled by 2^52 instead of 2^64 when it is usable.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            let has_ifma = (modulus < (1u64 << 51)) && crate::V4IFma::try_new().is_some();
            #[cfg(not(all(
                any(target_arch = "x86", target_arch = "x86_64"),
                feature = "nightly",
            )))]
            let has_ifma = false;

            let bits = if has_ifma { 52 } else { 64 };

            let mut twid = avec![0u64; polynomial_size].into_boxed_slice();
            let mut inv_twid = avec![0u64; polynomial_size].into_boxed_slice();
            // The Shoup tables are only used by the < 2^63 kernels; leave
            // them empty otherwise (Solinas/generic paths do not read them).
            let (mut twid_shoup, mut inv_twid_shoup) = if modulus < (1u64 << 63) {
                (
                    avec![0u64; polynomial_size].into_boxed_slice(),
                    avec![0u64; polynomial_size].into_boxed_slice(),
                )
            } else {
                (avec![].into_boxed_slice(), avec![].into_boxed_slice())
            };

            if modulus < (1u64 << 63) {
                init_negacyclic_twiddles_shoup(
                    modulus,
                    polynomial_size,
                    bits,
                    &mut twid,
                    &mut twid_shoup,
                    &mut inv_twid,
                    &mut inv_twid_shoup,
                );
            } else {
                init_negacyclic_twiddles(modulus, polynomial_size, &mut twid, &mut inv_twid);
            }

            // n^{-1} mod p via Fermat's little theorem: n^(p-2) mod p.
            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, modulus - 2);
            // Shoup companion of n^{-1}, scaled by 2^bits.
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << bits) / modulus as u128) as u64;
            // Barrett parameters for the elementwise product kernels:
            // big_q is the bit size of the modulus, p_barrett = floor(2^(big_q + bits - 1) / p).
            let big_q = (modulus.ilog2() + 1) as u64;
            let big_l = big_q + (bits - 1) as u64;
            let p_barrett = ((1u128 << big_l) / modulus as u128) as u64;

            Some(Self {
                twid,
                twid_shoup,
                inv_twid_shoup,
                inv_twid,
                p: modulus,
                p_div,
                p_barrett,
                big_q,
                n_inv_mod_p,
                n_inv_mod_p_shoup,
            })
        }
    }
795
    // Crate-internal accessor for the precomputed division data of the modulus.
    pub(crate) fn p_div(&self) -> Div64 {
        self.p_div
    }
799
    /// Returns the polynomial size of the negacyclic NTT plan.
    // The twiddle table has exactly one entry per coefficient.
    #[inline]
    pub fn ntt_size(&self) -> usize {
        self.twid.len()
    }
805
    /// Returns the modulus of the negacyclic NTT plan.
    #[inline]
    pub fn modulus(&self) -> u64 {
        self.p
    }
811
    /// Applies a forward negacyclic NTT transform in place to the given buffer.
    ///
    /// # Note
    /// On entry, the buffer holds the polynomial coefficients in standard order. On exit, the
    /// buffer holds the negacyclic NTT transform coefficients in bit reversed order.
    //
    // Dispatches to the most specialized kernel available for the modulus
    // size and the detected CPU features, from fastest to most general:
    // IFMA (< 2^50 / < 2^51), then < 2^62, < 2^63, the Solinas prime, and
    // finally the generic path.
    pub fn fwd(&self, buf: &mut [u64]) {
        assert_eq!(buf.len(), self.ntt_size());
        let p = self.p;

        // IFMA kernels (52-bit lanes) — only on nightly x86 with AVX-512 IFMA.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        if p < (1u64 << 50) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_50bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                return;
            }
        } else if p < (1u64 << 51) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_51bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                return;
            }
        }

        if p < (1u64 << 62) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_62bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_62bit::fwd_avx2(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
            }
            less_than_62bit::fwd_scalar(p, buf, &self.twid, &self.twid_shoup);
        } else if p < (1u64 << 63) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_63bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_63bit::fwd_avx2(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
            }
            less_than_63bit::fwd_scalar(p, buf, &self.twid, &self.twid_shoup);
        } else if p == Solinas::P {
            // Dedicated kernels for the Goldilocks/Solinas prime; these do
            // not use the Shoup tables (empty for p >= 2^63).
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    generic_solinas::fwd_avx512(simd, buf, Solinas, (), &self.twid);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    generic_solinas::fwd_avx2(simd, buf, Solinas, (), &self.twid);
                    return;
                }
            }
            generic_solinas::fwd_scalar(buf, Solinas, (), &self.twid);
        } else {
            // Generic >= 2^63 modulus: reduction goes through the full
            // precomputed reciprocal of p.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                let crate::u256 { x0, x1, x2, x3 } = self.p_div.double_reciprocal;
                let p_div = (p, x0, x1, x2, x3);
                generic_solinas::fwd_avx512(simd, buf, p, p_div, &self.twid);
                return;
            }
            generic_solinas::fwd_scalar(buf, p, self.p_div, &self.twid);
        }
    }
889
    /// Applies an inverse negacyclic NTT transform in place to the given buffer.
    ///
    /// # Note
    /// On entry, the buffer holds the negacyclic NTT transform coefficients in bit reversed order.
    /// On exit, the buffer holds the polynomial coefficients in standard order.
    ///
    /// # Panics
    /// Panics if `buf.len()` differs from the NTT size of this plan.
    pub fn inv(&self, buf: &mut [u64]) {
        assert_eq!(buf.len(), self.ntt_size());
        let p = self.p;

        // Small-modulus fast path (x86 + nightly only): AVX512-IFMA kernels,
        // specialized for p < 2^50 and p < 2^51 respectively.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        if p < (1u64 << 50) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_50bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                return;
            }
        } else if p < (1u64 << 51) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_51bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                return;
            }
        }

        // Dispatch on the bit-size of the modulus; within each size class,
        // prefer AVX512 (nightly only), then AVX2, then the scalar fallback.
        if p < (1u64 << 62) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_62bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_62bit::inv_avx2(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
            }
            less_than_62bit::inv_scalar(p, buf, &self.inv_twid, &self.inv_twid_shoup);
        } else if p < (1u64 << 63) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_63bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_63bit::inv_avx2(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
            }
            less_than_63bit::inv_scalar(p, buf, &self.inv_twid, &self.inv_twid_shoup);
        } else if p == Solinas::P {
            // Specialized kernels for the Solinas prime `Solinas::P`.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    generic_solinas::inv_avx512(simd, buf, Solinas, (), &self.inv_twid);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    generic_solinas::inv_avx2(simd, buf, Solinas, (), &self.inv_twid);
                    return;
                }
            }
            generic_solinas::inv_scalar(buf, Solinas, (), &self.inv_twid);
        } else {
            // Generic modulus >= 2^63: reduce through the precomputed 256-bit
            // reciprocal stored in `self.p_div`.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                let crate::u256 { x0, x1, x2, x3 } = self.p_div.double_reciprocal;
                let p_div = (p, x0, x1, x2, x3);
                generic_solinas::inv_avx512(simd, buf, p, p_div, &self.inv_twid);
                return;
            }
            generic_solinas::inv_scalar(buf, p, self.p_div, &self.inv_twid);
        }
    }
967
    /// Computes the elementwise product of `lhs` and `rhs`, multiplied by the inverse of the
    /// polynomial modulo the NTT modulus, and stores the result in `lhs`.
    ///
    /// In other words: `lhs[i] <- lhs[i] * rhs[i] * n_inv_mod_p (mod p)`.
    pub fn mul_assign_normalize(&self, lhs: &mut [u64], rhs: &[u64]) {
        let p = self.p;

        // The AVX512-IFMA path is only taken for moduli below 2^51.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        let has_ifma = (p < (1u64 << 51)) && crate::V4IFma::try_new().is_some();

        if p < (1 << 63) {
            // Barrett/Shoup-based kernels: IFMA, then AVX512, then AVX2, then
            // the scalar fallback.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if has_ifma {
                // p < 2^51
                let simd = crate::V4IFma::try_new().unwrap();
                mul_assign_normalize_ifma(
                    simd,
                    lhs,
                    rhs,
                    p,
                    self.p_barrett,
                    self.big_q,
                    self.n_inv_mod_p,
                    self.n_inv_mod_p_shoup,
                );
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                mul_assign_normalize_avx512(
                    simd,
                    lhs,
                    rhs,
                    p,
                    self.p_barrett,
                    self.big_q,
                    self.n_inv_mod_p,
                    self.n_inv_mod_p_shoup,
                );
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            if let Some(simd) = crate::V3::try_new() {
                mul_assign_normalize_avx2(
                    simd,
                    lhs,
                    rhs,
                    p,
                    self.p_barrett,
                    self.big_q,
                    self.n_inv_mod_p,
                    self.n_inv_mod_p_shoup,
                );
                return;
            }

            mul_assign_normalize_scalar(
                lhs,
                rhs,
                p,
                self.p_barrett,
                self.big_q,
                self.n_inv_mod_p,
                self.n_inv_mod_p_shoup,
            );
        } else if p == Solinas::P {
            // Specialized modular arithmetic for the Solinas prime.
            let n_inv_mod_p = self.n_inv_mod_p;
            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = *lhs_;
                let rhs = *rhs;
                let prod = <Solinas as PrimeModulus>::mul((), lhs, rhs);
                let prod = <Solinas as PrimeModulus>::mul((), prod, n_inv_mod_p);
                *lhs_ = prod;
            }
        } else {
            // Generic modulus >= 2^63: reduce through the precomputed divisor.
            let p_div = self.p_div;
            let n_inv_mod_p = self.n_inv_mod_p;
            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = *lhs_;
                let rhs = *rhs;
                let prod = <u64 as PrimeModulus>::mul(p_div, lhs, rhs);
                let prod = <u64 as PrimeModulus>::mul(p_div, prod, n_inv_mod_p);
                *lhs_ = prod;
            }
        }
    }
1057
    /// Multiplies the values by the inverse of the polynomial modulo the NTT modulus, and stores
    /// the result in `values`.
    ///
    /// In other words: `values[i] <- values[i] * n_inv_mod_p (mod p)`.
    pub fn normalize(&self, values: &mut [u64]) {
        let p = self.p;

        // The AVX512-IFMA path is only taken for moduli below 2^51.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        let has_ifma = (p < (1u64 << 51)) && crate::V4IFma::try_new().is_some();

        if p < (1 << 63) {
            // Shoup-multiplication kernels: IFMA, then AVX512, then AVX2,
            // then the scalar fallback.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if has_ifma {
                // p < 2^51
                let simd = crate::V4IFma::try_new().unwrap();
                normalize_ifma(simd, values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                normalize_avx512(simd, values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            if let Some(simd) = crate::V3::try_new() {
                normalize_avx2(simd, values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
                return;
            }

            normalize_scalar(values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
        } else if p == Solinas::P {
            // Specialized modular arithmetic for the Solinas prime.
            let n_inv_mod_p = self.n_inv_mod_p;
            for val in values {
                let prod = <Solinas as PrimeModulus>::mul((), *val, n_inv_mod_p);
                *val = prod;
            }
        } else {
            // Generic modulus >= 2^63: reduce through the precomputed divisor.
            let n_inv_mod_p = self.n_inv_mod_p;
            let p_div = self.p_div;
            for val in values {
                let prod = <u64 as PrimeModulus>::mul(p_div, *val, n_inv_mod_p);
                *val = prod;
            }
        }
    }
1106
    /// Computes the elementwise product of `lhs` and `rhs` and accumulates the result to `acc`.
    ///
    /// In other words: `acc[i] <- acc[i] + lhs[i] * rhs[i] (mod p)`.
    pub fn mul_accumulate(&self, acc: &mut [u64], lhs: &[u64], rhs: &[u64]) {
        let p = self.p;

        // The AVX512-IFMA path is only taken for moduli below 2^51.
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        let has_ifma = (p < (1u64 << 51)) && crate::V4IFma::try_new().is_some();

        if p < (1 << 63) {
            // Barrett-reduction kernels: IFMA, then AVX512, then AVX2, then
            // the scalar fallback.
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if has_ifma {
                // p < 2^51
                let simd = crate::V4IFma::try_new().unwrap();
                mul_accumulate_ifma(simd, acc, lhs, rhs, p, self.p_barrett, self.big_q);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                mul_accumulate_avx512(simd, acc, lhs, rhs, p, self.p_barrett, self.big_q);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            if let Some(simd) = crate::V3::try_new() {
                mul_accumulate_avx2(simd, acc, lhs, rhs, p, self.p_barrett, self.big_q);
                return;
            }

            mul_accumulate_scalar(acc, lhs, rhs, p, self.p_barrett, self.big_q);
        } else if p == Solinas::P {
            // Specialized modular arithmetic for the Solinas prime.
            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let prod = <Solinas as PrimeModulus>::mul((), *lhs, *rhs);
                *acc = <Solinas as PrimeModulus>::add(Solinas, *acc, prod);
            }
        } else {
            // Generic modulus >= 2^63: reduce through the precomputed divisor.
            let p_div = self.p_div;
            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let prod = <u64 as PrimeModulus>::mul(p_div, *lhs, *rhs);
                *acc = <u64 as PrimeModulus>::add(p, *acc, prod);
            }
        }
    }
1152}
1153
/// Reference (non-SIMD) modular-arithmetic helpers and end-to-end tests for
/// the 64-bit NTT plan.
#[cfg(test)]
pub mod tests {
    use super::*;
    use crate::{
        fastdiv::Div64, prime::largest_prime_in_arithmetic_progression64,
        prime64::generic_solinas::PrimeModulus,
    };
    use alloc::{vec, vec::Vec};
    use rand::random;

    extern crate alloc;

    /// Modular addition `(a + b) mod p`; assumes `a < p` and `b < p`.
    pub fn add(p: u64, a: u64, b: u64) -> u64 {
        let neg_b = p.wrapping_sub(b);
        if a >= neg_b {
            a - neg_b
        } else {
            a + b
        }
    }

    /// Modular subtraction `(a - b) mod p`; assumes `a < p` and `b < p`.
    pub fn sub(p: u64, a: u64, b: u64) -> u64 {
        let neg_b = p.wrapping_sub(b);
        if a >= b {
            a - b
        } else {
            a + neg_b
        }
    }

    /// Modular multiplication `(a * b) mod p` through a 128-bit intermediate.
    /// `p == 0` selects plain truncating arithmetic modulo `2^64`.
    pub fn mul(p: u64, a: u64, b: u64) -> u64 {
        let wide = a as u128 * b as u128;
        if p == 0 {
            wide as u64
        } else {
            (wide % p as u128) as u64
        }
    }

    /// Schoolbook O(n^2) negacyclic convolution of `lhs` and `rhs` modulo `p`:
    /// computes the full linear convolution, then folds the upper half back
    /// with a sign flip (X^n = -1 in the negacyclic ring).
    pub fn negacyclic_convolution(n: usize, p: u64, lhs: &[u64], rhs: &[u64]) -> vec::Vec<u64> {
        let mut full_convolution = vec![0u64; 2 * n];
        let mut negacyclic_convolution = vec![0u64; n];
        for i in 0..n {
            for j in 0..n {
                full_convolution[i + j] = add(p, full_convolution[i + j], mul(p, lhs[i], rhs[j]));
            }
        }
        for i in 0..n {
            negacyclic_convolution[i] = sub(p, full_convolution[i], full_convolution[i + n]);
        }
        negacyclic_convolution
    }

    /// Generates two random length-`n` polynomials with coefficients reduced
    /// modulo `p` (full-range when `p == 0`), along with their reference
    /// negacyclic convolution.
    pub fn random_lhs_rhs_with_negacyclic_convolution(
        n: usize,
        p: u64,
    ) -> (vec::Vec<u64>, vec::Vec<u64>, vec::Vec<u64>) {
        let mut lhs = vec![0u64; n];
        let mut rhs = vec![0u64; n];

        for x in &mut lhs {
            *x = random();
            if p != 0 {
                *x %= p;
            }
        }
        for x in &mut rhs {
            *x = random();
            if p != 0 {
                *x %= p;
            }
        }

        let lhs = lhs;
        let rhs = rhs;

        let negacyclic_convolution = negacyclic_convolution(n, p, &lhs, &rhs);
        (lhs, rhs, negacyclic_convolution)
    }

    /// End-to-end check of `fwd`/`inv`/`mul_assign_normalize` against the
    /// schoolbook negacyclic convolution, across every modulus size class
    /// (< 2^50, < 2^51, < 2^62, < 2^63, Solinas, and >= 2^63).
    #[test]
    fn test_product() {
        for n in [16, 32, 64, 128, 256, 512, 1024] {
            for p in [
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 49, 1 << 50).unwrap(),
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap(),
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 61, 1 << 62).unwrap(),
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 62, 1 << 63).unwrap(),
                Solinas::P,
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 63, u64::MAX).unwrap(),
            ] {
                let plan = Plan::try_new(n, p).unwrap();

                let (lhs, rhs, negacyclic_convolution) =
                    random_lhs_rhs_with_negacyclic_convolution(n, p);

                let mut prod = vec![0u64; n];
                let mut lhs_fourier = lhs.clone();
                let mut rhs_fourier = rhs.clone();

                plan.fwd(&mut lhs_fourier);
                plan.fwd(&mut rhs_fourier);

                // Forward transform outputs must be fully reduced.
                for x in &lhs_fourier {
                    assert!(*x < p);
                }
                for x in &rhs_fourier {
                    assert!(*x < p);
                }

                for i in 0..n {
                    prod[i] =
                        <u64 as PrimeModulus>::mul(Div64::new(p), lhs_fourier[i], rhs_fourier[i]);
                }
                // `inv` does not normalize, so `prod` holds the convolution
                // scaled by `n` (see the assertion below).
                plan.inv(&mut prod);

                // `mul_assign_normalize` folds in the `n^-1` factor, so `inv`
                // recovers the convolution exactly.
                plan.mul_assign_normalize(&mut lhs_fourier, &rhs_fourier);
                plan.inv(&mut lhs_fourier);

                // Inverse transform outputs must be fully reduced.
                for x in &prod {
                    assert!(*x < p);
                }

                for i in 0..n {
                    assert_eq!(
                        prod[i],
                        <u64 as PrimeModulus>::mul(
                            Div64::new(p),
                            negacyclic_convolution[i],
                            n as u64
                        ),
                    );
                }
                assert_eq!(lhs_fourier, negacyclic_convolution);
            }
        }
    }

    /// Checks the scalar Shoup-multiplication normalization against plain
    /// 128-bit modular arithmetic.
    #[test]
    fn test_normalize_scalar() {
        let p = largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
        let p_div = Div64::new(p);
        let polynomial_size = 128;

        // n^-1 mod p computed as n^(p - 2) (Fermat's little theorem; p is prime).
        let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
        // Shoup precomputation: floor(n_inv * 2^64 / p).
        let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;

        let mut val = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let mut val_target = val.clone();

        let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

        for val in val_target.iter_mut() {
            *val = mul(*val, n_inv_mod_p);
        }

        normalize_scalar(&mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
        assert_eq!(val, val_target);
    }

    /// Checks the scalar fused multiply + normalize against plain 128-bit
    /// modular arithmetic.
    #[test]
    fn test_mul_assign_normalize_scalar() {
        let p = largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
        let p_div = Div64::new(p);
        let polynomial_size = 128;

        let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
        let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
        // Barrett parameters: big_q is the bit length of p, and
        // p_barrett = floor(2^(big_q + 63) / p).
        let big_q = (p.ilog2() + 1) as u64;
        let big_l = big_q + 63;
        let p_barrett = ((1u128 << big_l) / p as u128) as u64;

        let mut lhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let mut lhs_target = lhs.clone();
        let rhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();

        let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

        for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
            *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
        }

        mul_assign_normalize_scalar(
            &mut lhs,
            &rhs,
            p,
            p_barrett,
            big_q,
            n_inv_mod_p,
            n_inv_mod_p_shoup,
        );
        assert_eq!(lhs, lhs_target);
    }

    /// Checks the scalar multiply-accumulate against plain 128-bit modular
    /// arithmetic.
    #[test]
    fn test_mul_accumulate_scalar() {
        let p = largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
        let polynomial_size = 128;

        // Barrett parameters: big_q is the bit length of p.
        let big_q = (p.ilog2() + 1) as u64;
        let big_l = big_q + 63;
        let p_barrett = ((1u128 << big_l) / p as u128) as u64;

        let mut acc = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let mut acc_target = acc.clone();
        let lhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let rhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();

        let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
        let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);

        for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
            *acc = add(mul(*lhs, *rhs), *acc);
        }

        mul_accumulate_scalar(&mut acc, &lhs, &rhs, p, p_barrett, big_q);
        assert_eq!(acc, acc_target);
    }

    /// Checks `Plan::mul_accumulate` across every modulus size class.
    #[test]
    fn test_mul_accumulate() {
        for p in [
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 51).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 61).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 62).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 63).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, u64::MAX).unwrap(),
        ] {
            let polynomial_size = 128;

            let mut acc = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut acc_target = acc.clone();
            let lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
            let add = |a: u64, b: u64| ((a as u128 + b as u128) % p as u128) as u64;

            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
                *acc = add(mul(*lhs, *rhs), *acc);
            }

            Plan::try_new(polynomial_size, p)
                .unwrap()
                .mul_accumulate(&mut acc, &lhs, &rhs);
            assert_eq!(acc, acc_target);
        }
    }

    /// Checks `Plan::mul_assign_normalize` across every modulus size class.
    #[test]
    fn test_mul_assign_normalize() {
        for p in [
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 51).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 61).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 62).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 63).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, u64::MAX).unwrap(),
        ] {
            let polynomial_size = 128;
            let p_div = Div64::new(p);
            // n^-1 mod p via Fermat's little theorem.
            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);

            let mut lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut lhs_target = lhs.clone();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
            }

            Plan::try_new(polynomial_size, p)
                .unwrap()
                .mul_assign_normalize(&mut lhs, &rhs);
            assert_eq!(lhs, lhs_target);
        }
    }

    /// Checks `Plan::normalize` across every modulus size class.
    #[test]
    fn test_normalize() {
        for p in [
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 51).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 61).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 62).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 63).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, u64::MAX).unwrap(),
        ] {
            let polynomial_size = 128;
            let p_div = Div64::new(p);
            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);

            let mut val = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut val_target = val.clone();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for val in &mut val_target {
                *val = mul(*val, n_inv_mod_p);
            }

            Plan::try_new(polynomial_size, p)
                .unwrap()
                .normalize(&mut val);
            assert_eq!(val, val_target);
        }
    }
}
1486
1487#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
1488#[cfg(test)]
1489mod x86_tests {
1490    use super::*;
1491    use crate::prime::largest_prime_in_arithmetic_progression64;
1492    use alloc::vec::Vec;
1493    use rand::random as rnd;
1494
1495    extern crate alloc;
1496
    /// Checks the AVX2 (4-lane) interleave/permute helpers against their
    /// expected lane shuffles, and that each interleave is its own inverse.
    #[test]
    fn test_interleaves_and_permutes_u64x4() {
        if let Some(simd) = crate::V3::try_new() {
            let a = u64x4(rnd(), rnd(), rnd(), rnd());
            let b = u64x4(rnd(), rnd(), rnd(), rnd());

            // interleave2: exchange 128-bit halves between the two vectors.
            assert_eq!(
                simd.interleave2_u64x4([a, b]),
                [u64x4(a.0, a.1, b.0, b.1), u64x4(a.2, a.3, b.2, b.3)],
            );
            // Applying it twice is the identity.
            assert_eq!(
                simd.interleave2_u64x4(simd.interleave2_u64x4([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd()];
            assert_eq!(simd.permute2_u64x4(w), u64x4(w[0], w[0], w[1], w[1]));

            // interleave1: interleave at 64-bit granularity.
            assert_eq!(
                simd.interleave1_u64x4([a, b]),
                [u64x4(a.0, b.0, a.2, b.2), u64x4(a.1, b.1, a.3, b.3)],
            );
            assert_eq!(
                simd.interleave1_u64x4(simd.interleave1_u64x4([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd(), rnd(), rnd()];
            assert_eq!(simd.permute1_u64x4(w), u64x4(w[0], w[2], w[1], w[3]));
        }
    }
1526
    /// Checks the AVX512 (8-lane) interleave/permute helpers against their
    /// expected lane shuffles, and that each interleave is its own inverse.
    #[cfg(feature = "nightly")]
    #[test]
    fn test_interleaves_and_permutes_u64x8() {
        if let Some(simd) = crate::V4::try_new() {
            let a = u64x8(rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd());
            let b = u64x8(rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd());

            // interleave4: exchange 256-bit halves between the two vectors.
            assert_eq!(
                simd.interleave4_u64x8([a, b]),
                [
                    u64x8(a.0, a.1, a.2, a.3, b.0, b.1, b.2, b.3),
                    u64x8(a.4, a.5, a.6, a.7, b.4, b.5, b.6, b.7),
                ],
            );
            // Applying it twice is the identity.
            assert_eq!(
                simd.interleave4_u64x8(simd.interleave4_u64x8([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd()];
            assert_eq!(
                simd.permute4_u64x8(w),
                u64x8(w[0], w[0], w[0], w[0], w[1], w[1], w[1], w[1]),
            );

            // interleave2: exchange 128-bit lanes.
            assert_eq!(
                simd.interleave2_u64x8([a, b]),
                [
                    u64x8(a.0, a.1, b.0, b.1, a.4, a.5, b.4, b.5),
                    u64x8(a.2, a.3, b.2, b.3, a.6, a.7, b.6, b.7),
                ],
            );
            assert_eq!(
                simd.interleave2_u64x8(simd.interleave2_u64x8([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd(), rnd(), rnd()];
            assert_eq!(
                simd.permute2_u64x8(w),
                u64x8(w[0], w[0], w[2], w[2], w[1], w[1], w[3], w[3]),
            );

            // interleave1: interleave at 64-bit granularity.
            assert_eq!(
                simd.interleave1_u64x8([a, b]),
                [
                    u64x8(a.0, b.0, a.2, b.2, a.4, b.4, a.6, b.6),
                    u64x8(a.1, b.1, a.3, b.3, a.5, b.5, a.7, b.7),
                ],
            );
            assert_eq!(
                simd.interleave1_u64x8(simd.interleave1_u64x8([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd()];
            assert_eq!(
                simd.permute1_u64x8(w),
                u64x8(w[0], w[4], w[1], w[5], w[2], w[6], w[3], w[7]),
            );
        }
    }
1586
    /// Checks the AVX512-IFMA fused multiply + normalize against plain
    /// 128-bit modular arithmetic, for a modulus in [2^50, 2^51).
    #[cfg(feature = "nightly")]
    #[test]
    fn test_mul_assign_normalize_ifma() {
        if let Some(simd) = crate::V4IFma::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap();
            let p_div = Div64::new(p);
            let polynomial_size = 128;

            // n^-1 mod p via Fermat's little theorem.
            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
            // NOTE(review): the IFMA variant uses a 2^52 Shoup scaling and a
            // `big_q + 51` Barrett shift, unlike the 2^64 / `big_q + 63`
            // parameters used by the other kernels — presumably to fit the
            // 52-bit products of the IFMA instructions; confirm against the
            // kernel implementation.
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 52) / p as u128) as u64;
            let big_q = (p.ilog2() + 1) as u64;
            let big_l = big_q + 51;
            let p_barrett = ((1u128 << big_l) / p as u128) as u64;

            let mut lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut lhs_target = lhs.clone();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
            }

            mul_assign_normalize_ifma(
                simd,
                &mut lhs,
                &rhs,
                p,
                p_barrett,
                big_q,
                n_inv_mod_p,
                n_inv_mod_p_shoup,
            );
            assert_eq!(lhs, lhs_target);
        }
    }
1629
1630    #[cfg(feature = "nightly")]
1631    #[test]
1632    fn test_mul_assign_normalize_avx512() {
1633        if let Some(simd) = crate::V4::try_new() {
1634            let p =
1635                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
1636            let p_div = Div64::new(p);
1637            let polynomial_size = 128;
1638
1639            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
1640            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
1641            let big_q = (p.ilog2() + 1) as u64;
1642            let big_l = big_q + 63;
1643            let p_barrett = ((1u128 << big_l) / p as u128) as u64;
1644
1645            let mut lhs = (0..polynomial_size)
1646                .map(|_| rand::random::<u64>() % p)
1647                .collect::<Vec<_>>();
1648            let mut lhs_target = lhs.clone();
1649            let rhs = (0..polynomial_size)
1650                .map(|_| rand::random::<u64>() % p)
1651                .collect::<Vec<_>>();
1652
1653            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1654
1655            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
1656                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
1657            }
1658
1659            mul_assign_normalize_avx512(
1660                simd,
1661                &mut lhs,
1662                &rhs,
1663                p,
1664                p_barrett,
1665                big_q,
1666                n_inv_mod_p,
1667                n_inv_mod_p_shoup,
1668            );
1669            assert_eq!(lhs, lhs_target);
1670        }
1671    }
1672
1673    #[test]
1674    fn test_mul_assign_normalize_avx2() {
1675        if let Some(simd) = crate::V3::try_new() {
1676            let p =
1677                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
1678            let p_div = Div64::new(p);
1679            let polynomial_size = 128;
1680
1681            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
1682            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
1683            let big_q = (p.ilog2() + 1) as u64;
1684            let big_l = big_q + 63;
1685            let p_barrett = ((1u128 << big_l) / p as u128) as u64;
1686
1687            let mut lhs = (0..polynomial_size)
1688                .map(|_| rand::random::<u64>() % p)
1689                .collect::<Vec<_>>();
1690            let mut lhs_target = lhs.clone();
1691            let rhs = (0..polynomial_size)
1692                .map(|_| rand::random::<u64>() % p)
1693                .collect::<Vec<_>>();
1694
1695            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1696
1697            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
1698                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
1699            }
1700
1701            mul_assign_normalize_avx2(
1702                simd,
1703                &mut lhs,
1704                &rhs,
1705                p,
1706                p_barrett,
1707                big_q,
1708                n_inv_mod_p,
1709                n_inv_mod_p_shoup,
1710            );
1711            assert_eq!(lhs, lhs_target);
1712        }
1713    }
1714
1715    #[cfg(feature = "nightly")]
1716    #[test]
1717    fn test_mul_accumulate_ifma() {
1718        if let Some(simd) = crate::V4IFma::try_new() {
1719            let p =
1720                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap();
1721            let polynomial_size = 128;
1722
1723            let big_q = (p.ilog2() + 1) as u64;
1724            let big_l = big_q + 51;
1725            let p_barrett = ((1u128 << big_l) / p as u128) as u64;
1726
1727            let mut acc = (0..polynomial_size)
1728                .map(|_| rand::random::<u64>() % p)
1729                .collect::<Vec<_>>();
1730            let mut acc_target = acc.clone();
1731            let lhs = (0..polynomial_size)
1732                .map(|_| rand::random::<u64>() % p)
1733                .collect::<Vec<_>>();
1734            let rhs = (0..polynomial_size)
1735                .map(|_| rand::random::<u64>() % p)
1736                .collect::<Vec<_>>();
1737
1738            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1739            let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);
1740
1741            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
1742                *acc = add(mul(*lhs, *rhs), *acc);
1743            }
1744
1745            mul_accumulate_ifma(simd, &mut acc, &lhs, &rhs, p, p_barrett, big_q);
1746            assert_eq!(acc, acc_target);
1747        }
1748    }
1749
1750    #[cfg(feature = "nightly")]
1751    #[test]
1752    fn test_mul_accumulate_avx512() {
1753        if let Some(simd) = crate::V4::try_new() {
1754            let p =
1755                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
1756            let polynomial_size = 128;
1757
1758            let big_q = (p.ilog2() + 1) as u64;
1759            let big_l = big_q + 63;
1760            let p_barrett = ((1u128 << big_l) / p as u128) as u64;
1761
1762            let mut acc = (0..polynomial_size)
1763                .map(|_| rand::random::<u64>() % p)
1764                .collect::<Vec<_>>();
1765            let mut acc_target = acc.clone();
1766            let lhs = (0..polynomial_size)
1767                .map(|_| rand::random::<u64>() % p)
1768                .collect::<Vec<_>>();
1769            let rhs = (0..polynomial_size)
1770                .map(|_| rand::random::<u64>() % p)
1771                .collect::<Vec<_>>();
1772
1773            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1774            let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);
1775
1776            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
1777                *acc = add(mul(*lhs, *rhs), *acc);
1778            }
1779
1780            mul_accumulate_avx512(simd, &mut acc, &lhs, &rhs, p, p_barrett, big_q);
1781            assert_eq!(acc, acc_target);
1782        }
1783    }
1784
1785    #[test]
1786    fn test_mul_accumulate_avx2() {
1787        if let Some(simd) = crate::V3::try_new() {
1788            let p =
1789                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
1790            let polynomial_size = 128;
1791
1792            let big_q = (p.ilog2() + 1) as u64;
1793            let big_l = big_q + 63;
1794            let p_barrett = ((1u128 << big_l) / p as u128) as u64;
1795
1796            let mut acc = (0..polynomial_size)
1797                .map(|_| rand::random::<u64>() % p)
1798                .collect::<Vec<_>>();
1799            let mut acc_target = acc.clone();
1800            let lhs = (0..polynomial_size)
1801                .map(|_| rand::random::<u64>() % p)
1802                .collect::<Vec<_>>();
1803            let rhs = (0..polynomial_size)
1804                .map(|_| rand::random::<u64>() % p)
1805                .collect::<Vec<_>>();
1806
1807            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1808            let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);
1809
1810            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
1811                *acc = add(mul(*lhs, *rhs), *acc);
1812            }
1813
1814            mul_accumulate_avx2(simd, &mut acc, &lhs, &rhs, p, p_barrett, big_q);
1815            assert_eq!(acc, acc_target);
1816        }
1817    }
1818
1819    #[cfg(feature = "nightly")]
1820    #[test]
1821    fn test_normalize_ifma() {
1822        if let Some(simd) = crate::V4IFma::try_new() {
1823            let p =
1824                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap();
1825            let p_div = Div64::new(p);
1826            let polynomial_size = 128;
1827
1828            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
1829            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 52) / p as u128) as u64;
1830
1831            let mut val = (0..polynomial_size)
1832                .map(|_| rand::random::<u64>() % p)
1833                .collect::<Vec<_>>();
1834            let mut val_target = val.clone();
1835
1836            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1837
1838            for val in val_target.iter_mut() {
1839                *val = mul(*val, n_inv_mod_p);
1840            }
1841
1842            normalize_ifma(simd, &mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
1843            assert_eq!(val, val_target);
1844        }
1845    }
1846
1847    #[cfg(feature = "nightly")]
1848    #[test]
1849    fn test_normalize_avx512() {
1850        if let Some(simd) = crate::V4::try_new() {
1851            let p =
1852                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
1853            let p_div = Div64::new(p);
1854            let polynomial_size = 128;
1855
1856            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
1857            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
1858
1859            let mut val = (0..polynomial_size)
1860                .map(|_| rand::random::<u64>() % p)
1861                .collect::<Vec<_>>();
1862            let mut val_target = val.clone();
1863
1864            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1865
1866            for val in val_target.iter_mut() {
1867                *val = mul(*val, n_inv_mod_p);
1868            }
1869
1870            normalize_avx512(simd, &mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
1871            assert_eq!(val, val_target);
1872        }
1873    }
1874
1875    #[test]
1876    fn test_normalize_avx2() {
1877        if let Some(simd) = crate::V3::try_new() {
1878            let p =
1879                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
1880            let p_div = Div64::new(p);
1881            let polynomial_size = 128;
1882
1883            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
1884            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
1885
1886            let mut val = (0..polynomial_size)
1887                .map(|_| rand::random::<u64>() % p)
1888                .collect::<Vec<_>>();
1889            let mut val_target = val.clone();
1890
1891            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
1892
1893            for val in val_target.iter_mut() {
1894                *val = mul(*val, n_inv_mod_p);
1895            }
1896
1897            normalize_avx2(simd, &mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
1898            assert_eq!(val, val_target);
1899        }
1900    }
1901
1902    #[test]
1903    fn test_plan_crash_github_11() {
1904        assert!(Plan::try_new(2048, 1024).is_none());
1905    }
1906}