use crate::{bit_rev, fastdiv::Div64, prime::is_prime64, roots::find_primitive_root64};
use aligned_vec::{avec, ABox};

#[allow(unused_imports)]
use pulp::*;

const RECURSION_THRESHOLD: usize = 1024;
pub(crate) const SOLINAS_PRIME: u64 = ((1_u128 << 64) - (1_u128 << 32) + 1) as u64;
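// SOLINAS_PRIME is p = 2^64 - 2^32 + 1, the "Goldilocks" Solinas prime. Since
// p - 1 = 2^32 * (2^32 - 1), primitive 2n-th roots of unity exist for every
// power-of-two n up to 2^31, which the negacyclic NTT below relies on.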

mod generic_solinas;
mod shoup;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
mod less_than_50bit;
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
mod less_than_51bit;

mod less_than_62bit;
mod less_than_63bit;

use self::generic_solinas::PrimeModulus;
use crate::roots::find_root_solinas_64;
pub use generic_solinas::Solinas;

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
impl crate::V3 {
    #[inline(always)]
    fn interleave2_u64x4(self, z0z0z1z1: [u64x4; 2]) -> [u64x4; 2] {
        let avx = self.avx;
        [
            cast(
                avx._mm256_permute2f128_si256::<0b0010_0000>(cast(z0z0z1z1[0]), cast(z0z0z1z1[1])),
            ),
            cast(
                avx._mm256_permute2f128_si256::<0b0011_0001>(cast(z0z0z1z1[0]), cast(z0z0z1z1[1])),
            ),
        ]
    }

    #[inline(always)]
    fn permute2_u64x4(self, w: [u64; 2]) -> u64x4 {
        let avx = self.avx;
        let w00 = self.sse2._mm_set1_epi64x(w[0] as _);
        let w11 = self.sse2._mm_set1_epi64x(w[1] as _);
        cast(avx._mm256_insertf128_si256::<0b1>(avx._mm256_castsi128_si256(w00), w11))
    }

    #[inline(always)]
    fn interleave1_u64x4(self, z0z1: [u64x4; 2]) -> [u64x4; 2] {
        let avx = self.avx2;
        [
            cast(avx._mm256_unpacklo_epi64(cast(z0z1[0]), cast(z0z1[1]))),
            cast(avx._mm256_unpackhi_epi64(cast(z0z1[0]), cast(z0z1[1]))),
        ]
    }

    #[inline(always)]
    fn permute1_u64x4(self, w: [u64; 4]) -> u64x4 {
        let avx = self.avx;
        let w0123 = pulp::cast(w);
        let w0101 = avx._mm256_permute2f128_si256::<0b0000_0000>(w0123, w0123);
        let w2323 = avx._mm256_permute2f128_si256::<0b0011_0011>(w0123, w0123);
        cast(avx._mm256_castpd_si256(avx._mm256_shuffle_pd::<0b1100>(
            avx._mm256_castsi256_pd(w0101),
            avx._mm256_castsi256_pd(w2323),
        )))
    }

    #[inline(always)]
    pub fn small_mod_u64x4(self, modulus: u64x4, x: u64x4) -> u64x4 {
        self.select_u64x4(
            self.cmp_gt_u64x4(modulus, x),
            x,
            self.wrapping_sub_u64x4(x, modulus),
        )
    }
}
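// Scalar model of `small_mod_u64x4`/`small_mod_u64x8` (illustrative): for
// x in [0, 2*modulus), this computes x mod modulus as a branchless select:
//     if modulus > x { x } else { x.wrapping_sub(modulus) }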

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
impl crate::V4 {
    #[inline(always)]
    fn interleave4_u64x8(self, z0z0z0z0z1z1z1z1: [u64x8; 2]) -> [u64x8; 2] {
        let avx = self.avx512f;
        let idx_0 = avx._mm512_setr_epi64(0x0, 0x1, 0x2, 0x3, 0x8, 0x9, 0xa, 0xb);
        let idx_1 = avx._mm512_setr_epi64(0x4, 0x5, 0x6, 0x7, 0xc, 0xd, 0xe, 0xf);
        [
            cast(avx._mm512_permutex2var_epi64(
                cast(z0z0z0z0z1z1z1z1[0]),
                idx_0,
                cast(z0z0z0z0z1z1z1z1[1]),
            )),
            cast(avx._mm512_permutex2var_epi64(
                cast(z0z0z0z0z1z1z1z1[0]),
                idx_1,
                cast(z0z0z0z0z1z1z1z1[1]),
            )),
        ]
    }

    #[inline(always)]
    fn permute4_u64x8(self, w: [u64; 2]) -> u64x8 {
        let avx = self.avx512f;
        let w = pulp::cast(w);
        let w01xxxxxx = avx._mm512_castsi128_si512(w);
        let idx = avx._mm512_setr_epi64(0, 0, 0, 0, 1, 1, 1, 1);
        cast(avx._mm512_permutexvar_epi64(idx, w01xxxxxx))
    }

    #[inline(always)]
    fn interleave2_u64x8(self, z0z0z1z1: [u64x8; 2]) -> [u64x8; 2] {
        let avx = self.avx512f;
        let idx_0 = avx._mm512_setr_epi64(0x0, 0x1, 0x8, 0x9, 0x4, 0x5, 0xc, 0xd);
        let idx_1 = avx._mm512_setr_epi64(0x2, 0x3, 0xa, 0xb, 0x6, 0x7, 0xe, 0xf);
        [
            cast(avx._mm512_permutex2var_epi64(cast(z0z0z1z1[0]), idx_0, cast(z0z0z1z1[1]))),
            cast(avx._mm512_permutex2var_epi64(cast(z0z0z1z1[0]), idx_1, cast(z0z0z1z1[1]))),
        ]
    }

    #[inline(always)]
    fn permute2_u64x8(self, w: [u64; 4]) -> u64x8 {
        let avx = self.avx512f;
        let w = pulp::cast(w);
        let w0123xxxx = avx._mm512_castsi256_si512(w);
        let idx = avx._mm512_setr_epi64(0, 0, 2, 2, 1, 1, 3, 3);
        cast(avx._mm512_permutexvar_epi64(idx, w0123xxxx))
    }

    #[inline(always)]
    fn interleave1_u64x8(self, z0z1: [u64x8; 2]) -> [u64x8; 2] {
        let avx = self.avx512f;
        [
            cast(avx._mm512_unpacklo_epi64(cast(z0z1[0]), cast(z0z1[1]))),
            cast(avx._mm512_unpackhi_epi64(cast(z0z1[0]), cast(z0z1[1]))),
        ]
    }

    #[inline(always)]
    fn permute1_u64x8(self, w: [u64; 8]) -> u64x8 {
        let avx = self.avx512f;
        let w = pulp::cast(w);
        let idx = avx._mm512_setr_epi64(0, 4, 1, 5, 2, 6, 3, 7);
        cast(avx._mm512_permutexvar_epi64(idx, w))
    }

    #[inline(always)]
    pub fn small_mod_u64x8(self, modulus: u64x8, x: u64x8) -> u64x8 {
        self.select_u64x8(
            self.cmp_gt_u64x8(modulus, x),
            x,
            self.wrapping_sub_u64x8(x, modulus),
        )
    }
}
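// Each `interleaveN_*` shuffle exchanges blocks of N lanes between its two
// input registers and is its own inverse (see `x86_tests`); the matching
// `permuteN_*` shuffles broadcast twiddle factors into that same lane layout.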

fn init_negacyclic_twiddles(p: u64, n: usize, twid: &mut [u64], inv_twid: &mut [u64]) {
    let div = Div64::new(p);

    let w = if p == SOLINAS_PRIME {
        match n {
            32 => 8_u64,
            64 => 2198989700608_u64,
            128 => 14041890976876060974_u64,
            256 => 14430643036723656017_u64,
            512 => 4440654710286119610_u64,
            1024 => 8816101479115663336_u64,
            2048 => 10974926054405199669_u64,
            4096 => 1206500561358145487_u64,
            8192 => 10930245224889659871_u64,
            16384 => 3333600369887534767_u64,
            32768 => 15893793146607301539_u64,
            _ => find_root_solinas_64(div, 2 * n as u64).unwrap(),
        }
    } else {
        find_primitive_root64(div, 2 * n as u64).unwrap()
    };

    let mut k = 0;
    let mut wk = 1u64;

    let nbits = n.trailing_zeros();
    while k < n {
        let fwd_idx = bit_rev(nbits, k);

        twid[fwd_idx] = wk;

        let inv_idx = bit_rev(nbits, (n - k) % n);
        if k == 0 {
            inv_twid[inv_idx] = wk;
        } else {
            let x = p.wrapping_sub(wk);
            inv_twid[inv_idx] = x;
        }

        wk = Div64::rem_u128(wk as u128 * w as u128, div);
        k += 1;
    }
}
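// Twiddle layout: `twid[bit_rev(k)] = w^k` for a primitive 2n-th root of unity
// w, and `inv_twid[bit_rev(j)] = w^{-j}`: since w^n = -1 (mod p), the stored
// value p - w^{n-j} equals -w^{n-j} = w^{-j}.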

fn init_negacyclic_twiddles_shoup(
    p: u64,
    n: usize,
    max_bits: u32,
    twid: &mut [u64],
    twid_shoup: &mut [u64],
    inv_twid: &mut [u64],
    inv_twid_shoup: &mut [u64],
) {
    let div = Div64::new(p);
    let w = find_primitive_root64(div, 2 * n as u64).unwrap();
    let mut k = 0;
    let mut wk = 1u64;

    let nbits = n.trailing_zeros();
    while k < n {
        let fwd_idx = bit_rev(nbits, k);

        let wk_shoup = Div64::div_u128((wk as u128) << max_bits, div) as u64;
        twid[fwd_idx] = wk;
        twid_shoup[fwd_idx] = wk_shoup;

        let inv_idx = bit_rev(nbits, (n - k) % n);
        if k == 0 {
            inv_twid[inv_idx] = wk;
            inv_twid_shoup[inv_idx] = wk_shoup;
        } else {
            let x = p.wrapping_sub(wk);
            inv_twid[inv_idx] = x;
            inv_twid_shoup[inv_idx] = Div64::div_u128((x as u128) << max_bits, div) as u64;
        }

        wk = Div64::rem_u128(wk as u128 * w as u128, div);
        k += 1;
    }
}
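// The `*_shoup` twiddles hold floor((w << max_bits) / p), with max_bits = 64
// (or 52 on the AVX-512 IFMA path). This enables Shoup's trick for multiplying
// by the fixed constant w modulo p with a single high product, e.g. in scalar
// form:
//     let q = ((w_shoup as u128 * x as u128) >> 64) as u64;
//     let r = w.wrapping_mul(x).wrapping_sub(q.wrapping_mul(p)); // r in [0, 2p)

/// Negacyclic NTT plan for multiplying polynomials modulo `X^n + 1` over a
/// 64-bit prime field.
///
/// Illustrative flow (mirrors `tests::test_product`; the exact import path
/// depends on how this module is exposed by the crate):
///
/// ```ignore
/// let n = 32;
/// let plan = Plan::try_new(n, 0xffff_ffff_0000_0001).unwrap(); // Solinas::P
/// let mut lhs = vec![1u64; n];
/// let mut rhs = vec![2u64; n];
/// plan.fwd(&mut lhs);
/// plan.fwd(&mut rhs);
/// // lhs <- lhs * rhs * n^{-1}, pointwise in the NTT domain
/// plan.mul_assign_normalize(&mut lhs, &rhs);
/// plan.inv(&mut lhs); // lhs now holds the negacyclic product
/// ```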
#[derive(Clone)]
pub struct Plan {
    twid: ABox<[u64]>,
    twid_shoup: ABox<[u64]>,
    inv_twid: ABox<[u64]>,
    inv_twid_shoup: ABox<[u64]>,
    p: u64,
    p_div: Div64,

    p_barrett: u64,
    big_q: u64,

    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
}

impl core::fmt::Debug for Plan {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("Plan")
            .field("ntt_size", &self.ntt_size())
            .field("modulus", &self.modulus())
            .finish()
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
fn mul_assign_normalize_ifma(
    simd: crate::V4IFma,
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let lhs = pulp::as_arrays_mut::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(52 - (big_q - 1));
            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let p_barrett = simd.splat_u64x8(p_barrett);
            let neg_p = simd.splat_u64x8(p.wrapping_neg());
            let p = simd.splat_u64x8(p);
            let zero = simd.splat_u64x8(0);

            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = cast(*lhs_);
                let rhs = cast(*rhs);

                let (lo, hi) = simd.widening_mul_u52x8(lhs, rhs);
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u52x8(c1, p_barrett).1;
                let prod = simd.wrapping_mul_add_u52x8(neg_p, c3, lo);

                let shoup_q = simd.widening_mul_u52x8(prod, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_mul_add_u52x8(
                    shoup_q,
                    neg_p,
                    simd.wrapping_mul_add_u52x8(prod, n_inv_mod_p, zero),
                );

                *lhs_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    )
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
fn mul_accumulate_ifma(
    simd: crate::V4IFma,
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let acc = pulp::as_arrays_mut::<8, _>(acc).0;
            let lhs = pulp::as_arrays::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(52 - (big_q - 1));
            let p_barrett = simd.splat_u64x8(p_barrett);
            let neg_p = simd.splat_u64x8(p.wrapping_neg());
            let p = simd.splat_u64x8(p);

            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let lhs = cast(*lhs);
                let rhs = cast(*rhs);

                let (lo, hi) = simd.widening_mul_u52x8(lhs, rhs);
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u52x8(c1, p_barrett).1;
                let prod = simd.wrapping_mul_add_u52x8(neg_p, c3, lo);
                let prod = simd.small_mod_u64x8(p, prod);

                *acc = cast(simd.small_mod_u64x8(p, simd.wrapping_add_u64x8(prod, cast(*acc))));
            }
        },
    )
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
fn mul_assign_normalize_avx512(
    simd: crate::V4,
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let lhs = pulp::as_arrays_mut::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(64 - (big_q - 1));
            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let p_barrett = simd.splat_u64x8(p_barrett);
            let p = simd.splat_u64x8(p);

            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = cast(*lhs_);
                let rhs = cast(*rhs);

                let (lo, hi) = simd.widening_mul_u64x8(lhs, rhs);
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u64x8(c1, p_barrett).1;
                let prod = simd.wrapping_sub_u64x8(lo, simd.wrapping_mul_u64x8(p, c3));

                let shoup_q = simd.widening_mul_u64x8(prod, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x8(
                    simd.wrapping_mul_u64x8(prod, n_inv_mod_p),
                    simd.wrapping_mul_u64x8(shoup_q, p),
                );

                *lhs_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    );
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
fn mul_accumulate_avx512(
    simd: crate::V4,
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let acc = pulp::as_arrays_mut::<8, _>(acc).0;
            let lhs = pulp::as_arrays::<8, _>(lhs).0;
            let rhs = pulp::as_arrays::<8, _>(rhs).0;

            let big_q_m1 = simd.splat_u64x8(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x8(64 - (big_q - 1));
            let p_barrett = simd.splat_u64x8(p_barrett);
            let p = simd.splat_u64x8(p);

            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let lhs = cast(*lhs);
                let rhs = cast(*rhs);

                let (lo, hi) = simd.widening_mul_u64x8(lhs, rhs);
                let c1 = simd.or_u64x8(
                    simd.shr_dyn_u64x8(lo, big_q_m1),
                    simd.shl_dyn_u64x8(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u64x8(c1, p_barrett).1;
                let prod = simd.wrapping_sub_u64x8(lo, simd.wrapping_mul_u64x8(p, c3));
                let prod = simd.small_mod_u64x8(p, prod);

                *acc = cast(simd.small_mod_u64x8(p, simd.wrapping_add_u64x8(prod, cast(*acc))));
            }
        },
    )
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn mul_assign_normalize_avx2(
    simd: crate::V3,
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let lhs = pulp::as_arrays_mut::<4, _>(lhs).0;
            let rhs = pulp::as_arrays::<4, _>(rhs).0;
            let big_q_m1 = simd.splat_u64x4(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x4(64 - (big_q - 1));
            let n_inv_mod_p = simd.splat_u64x4(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x4(n_inv_mod_p_shoup);
            let p_barrett = simd.splat_u64x4(p_barrett);
            let p = simd.splat_u64x4(p);

            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = cast(*lhs_);
                let rhs = cast(*rhs);

                let (lo, hi) = simd.widening_mul_u64x4(lhs, rhs);
                let c1 = simd.or_u64x4(
                    simd.shr_dyn_u64x4(lo, big_q_m1),
                    simd.shl_dyn_u64x4(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u64x4(c1, p_barrett).1;
                let prod = simd.wrapping_sub_u64x4(lo, simd.widening_mul_u64x4(p, c3).0);

                let shoup_q = simd.widening_mul_u64x4(prod, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x4(
                    simd.widening_mul_u64x4(prod, n_inv_mod_p).0,
                    simd.widening_mul_u64x4(shoup_q, p).0,
                );

                *lhs_ = cast(simd.small_mod_u64x4(p, t));
            }
        },
    );
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn mul_accumulate_avx2(
    simd: crate::V3,
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let acc = pulp::as_arrays_mut::<4, _>(acc).0;
            let lhs = pulp::as_arrays::<4, _>(lhs).0;
            let rhs = pulp::as_arrays::<4, _>(rhs).0;

            let big_q_m1 = simd.splat_u64x4(big_q - 1);
            let big_q_m1_complement = simd.splat_u64x4(64 - (big_q - 1));
            let p_barrett = simd.splat_u64x4(p_barrett);
            let p = simd.splat_u64x4(p);

            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let lhs = cast(*lhs);
                let rhs = cast(*rhs);

                let (lo, hi) = simd.widening_mul_u64x4(lhs, rhs);
                let c1 = simd.or_u64x4(
                    simd.shr_dyn_u64x4(lo, big_q_m1),
                    simd.shl_dyn_u64x4(hi, big_q_m1_complement),
                );
                let c3 = simd.widening_mul_u64x4(c1, p_barrett).1;
                let prod = simd.wrapping_sub_u64x4(lo, simd.widening_mul_u64x4(p, c3).0);
                let prod = simd.small_mod_u64x4(p, prod);

                *acc = cast(simd.small_mod_u64x4(p, simd.wrapping_add_u64x4(prod, cast(*acc))));
            }
        },
    )
}
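// Scalar reference for the Barrett + Shoup arithmetic used by all the
// `mul_assign_normalize_*`/`mul_accumulate_*` variants. With big_q the bit
// length of p and p_barrett = floor(2^(big_q + 63) / p) (big_q + 51 and 52-bit
// limbs on the IFMA path), the product d = lhs * rhs < p^2 is Barrett-reduced:
//     c1 = (d >> (big_q - 1)) as u64;
//     c3 = ((c1 as u128 * p_barrett as u128) >> 64) as u64;
//     prod = (d as u64).wrapping_sub(p.wrapping_mul(c3)); // prod in [0, 2p)
// Normalization by n^{-1} then uses Shoup's trick with the precomputed
// n_inv_mod_p_shoup = floor((n_inv_mod_p << 64) / p); the final
// `t.min(t.wrapping_sub(p))` folds [0, 2p) into [0, p).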
fn mul_assign_normalize_scalar(
    lhs: &mut [u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    let big_q_m1 = big_q - 1;

    for (lhs_, rhs) in crate::izip!(lhs, rhs) {
        let lhs = *lhs_;
        let rhs = *rhs;

        let d = lhs as u128 * rhs as u128;
        let c1 = (d >> big_q_m1) as u64;
        let c3 = ((c1 as u128 * p_barrett as u128) >> 64) as u64;
        let prod = (d as u64).wrapping_sub(p.wrapping_mul(c3));

        let shoup_q = (((prod as u128) * (n_inv_mod_p_shoup as u128)) >> 64) as u64;
        let t = u64::wrapping_sub(prod.wrapping_mul(n_inv_mod_p), shoup_q.wrapping_mul(p));

        *lhs_ = t.min(t.wrapping_sub(p));
    }
}

fn mul_accumulate_scalar(
    acc: &mut [u64],
    lhs: &[u64],
    rhs: &[u64],
    p: u64,
    p_barrett: u64,
    big_q: u64,
) {
    let big_q_m1 = big_q - 1;

    for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
        let lhs = *lhs;
        let rhs = *rhs;

        let d = lhs as u128 * rhs as u128;
        let c1 = (d >> big_q_m1) as u64;
        let c3 = ((c1 as u128 * p_barrett as u128) >> 64) as u64;
        let prod = (d as u64).wrapping_sub(p.wrapping_mul(c3));
        let prod = prod.min(prod.wrapping_sub(p));

        let acc_ = prod + *acc;
        *acc = acc_.min(acc_.wrapping_sub(p));
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
fn normalize_ifma(
    simd: crate::V4IFma,
    values: &mut [u64],
    p: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        || {
            let values = pulp::as_arrays_mut::<8, _>(values).0;

            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let neg_p = simd.splat_u64x8(p.wrapping_neg());
            let p = simd.splat_u64x8(p);
            let zero = simd.splat_u64x8(0);

            for val_ in values {
                let val = cast(*val_);

                let shoup_q = simd.widening_mul_u52x8(val, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_mul_add_u52x8(
                    shoup_q,
                    neg_p,
                    simd.wrapping_mul_add_u52x8(val, n_inv_mod_p, zero),
                );

                *val_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    )
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(feature = "nightly")]
fn normalize_avx512(
    simd: crate::V4,
    values: &mut [u64],
    p: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let values = pulp::as_arrays_mut::<8, _>(values).0;

            let n_inv_mod_p = simd.splat_u64x8(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x8(n_inv_mod_p_shoup);
            let p = simd.splat_u64x8(p);

            for val_ in values {
                let val = cast(*val_);

                let shoup_q = simd.widening_mul_u64x8(val, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x8(
                    simd.wrapping_mul_u64x8(val, n_inv_mod_p),
                    simd.wrapping_mul_u64x8(shoup_q, p),
                );

                *val_ = cast(simd.small_mod_u64x8(p, t));
            }
        },
    );
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn normalize_avx2(
    simd: crate::V3,
    values: &mut [u64],
    p: u64,
    n_inv_mod_p: u64,
    n_inv_mod_p_shoup: u64,
) {
    simd.vectorize(
        #[inline(always)]
        move || {
            let values = pulp::as_arrays_mut::<4, _>(values).0;

            let n_inv_mod_p = simd.splat_u64x4(n_inv_mod_p);
            let n_inv_mod_p_shoup = simd.splat_u64x4(n_inv_mod_p_shoup);
            let p = simd.splat_u64x4(p);

            for val_ in values {
                let val = cast(*val_);

                let shoup_q = simd.widening_mul_u64x4(val, n_inv_mod_p_shoup).1;
                let t = simd.wrapping_sub_u64x4(
                    simd.widening_mul_u64x4(val, n_inv_mod_p).0,
                    simd.widening_mul_u64x4(shoup_q, p).0,
                );

                *val_ = cast(simd.small_mod_u64x4(p, t));
            }
        },
    );
}
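// Scalar fallback: multiplies each value by n^{-1} mod p via Shoup's trick
// (see the note above `mul_assign_normalize_scalar`).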
fn normalize_scalar(values: &mut [u64], p: u64, n_inv_mod_p: u64, n_inv_mod_p_shoup: u64) {
    for val_ in values {
        let val = *val_;

        let shoup_q = (((val as u128) * (n_inv_mod_p_shoup as u128)) >> 64) as u64;
        let t = u64::wrapping_sub(val.wrapping_mul(n_inv_mod_p), shoup_q.wrapping_mul(p));

        *val_ = t.min(t.wrapping_sub(p));
    }
}

impl Plan {
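    /// Returns a negacyclic NTT plan for the given polynomial size and modulus,
    /// or `None` if the size is less than 16 or not a power of two, if the
    /// modulus is not prime, or if no primitive `2 * polynomial_size`-th root
    /// of unity exists modulo it.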
    pub fn try_new(polynomial_size: usize, modulus: u64) -> Option<Self> {
        let p_div = Div64::new(modulus);
        if polynomial_size < 16
            || !polynomial_size.is_power_of_two()
            || !is_prime64(modulus)
            || find_primitive_root64(p_div, 2 * polynomial_size as u64).is_none()
        {
            None
        } else {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            let has_ifma = (modulus < (1u64 << 51)) && crate::V4IFma::try_new().is_some();
            #[cfg(not(all(
                any(target_arch = "x86", target_arch = "x86_64"),
                feature = "nightly",
            )))]
            let has_ifma = false;

            let bits = if has_ifma { 52 } else { 64 };

            let mut twid = avec![0u64; polynomial_size].into_boxed_slice();
            let mut inv_twid = avec![0u64; polynomial_size].into_boxed_slice();
            let (mut twid_shoup, mut inv_twid_shoup) = if modulus < (1u64 << 63) {
                (
                    avec![0u64; polynomial_size].into_boxed_slice(),
                    avec![0u64; polynomial_size].into_boxed_slice(),
                )
            } else {
                (avec![].into_boxed_slice(), avec![].into_boxed_slice())
            };

            if modulus < (1u64 << 63) {
                init_negacyclic_twiddles_shoup(
                    modulus,
                    polynomial_size,
                    bits,
                    &mut twid,
                    &mut twid_shoup,
                    &mut inv_twid,
                    &mut inv_twid_shoup,
                );
            } else {
                init_negacyclic_twiddles(modulus, polynomial_size, &mut twid, &mut inv_twid);
            }

            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, modulus - 2);
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << bits) / modulus as u128) as u64;
            let big_q = (modulus.ilog2() + 1) as u64;
            let big_l = big_q + (bits - 1) as u64;
            let p_barrett = ((1u128 << big_l) / modulus as u128) as u64;

            Some(Self {
                twid,
                twid_shoup,
                inv_twid_shoup,
                inv_twid,
                p: modulus,
                p_div,
                p_barrett,
                big_q,
                n_inv_mod_p,
                n_inv_mod_p_shoup,
            })
        }
    }

    pub(crate) fn p_div(&self) -> Div64 {
        self.p_div
    }

    /// Returns the NTT size (polynomial size) that this plan was created for.
    #[inline]
    pub fn ntt_size(&self) -> usize {
        self.twid.len()
    }

    /// Returns the modulus that this plan was created for.
    #[inline]
    pub fn modulus(&self) -> u64 {
        self.p
    }

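    /// Computes the forward negacyclic NTT of `buf` in place. The output is in
    /// an order internal to the implementation; it is the order expected by
    /// [`Plan::inv`], [`Plan::mul_assign_normalize`] and [`Plan::mul_accumulate`].
    ///
    /// # Panics
    ///
    /// Panics if `buf.len()` is not equal to the NTT size.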
    pub fn fwd(&self, buf: &mut [u64]) {
        assert_eq!(buf.len(), self.ntt_size());
        let p = self.p;

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        if p < (1u64 << 50) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_50bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                return;
            }
        } else if p < (1u64 << 51) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_51bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                return;
            }
        }

        if p < (1u64 << 62) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_62bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_62bit::fwd_avx2(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
            }
            less_than_62bit::fwd_scalar(p, buf, &self.twid, &self.twid_shoup);
        } else if p < (1u64 << 63) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_63bit::fwd_avx512(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_63bit::fwd_avx2(simd, p, buf, &self.twid, &self.twid_shoup);
                    return;
                }
            }
            less_than_63bit::fwd_scalar(p, buf, &self.twid, &self.twid_shoup);
        } else if p == Solinas::P {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    generic_solinas::fwd_avx512(simd, buf, Solinas, (), &self.twid);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    generic_solinas::fwd_avx2(simd, buf, Solinas, (), &self.twid);
                    return;
                }
            }
            generic_solinas::fwd_scalar(buf, Solinas, (), &self.twid);
        } else {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                let crate::u256 { x0, x1, x2, x3 } = self.p_div.double_reciprocal;
                let p_div = (p, x0, x1, x2, x3);
                generic_solinas::fwd_avx512(simd, buf, p, p_div, &self.twid);
                return;
            }
            generic_solinas::fwd_scalar(buf, p, self.p_div, &self.twid);
        }
    }

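    /// Computes the inverse negacyclic NTT of `buf` in place, without
    /// normalizing the result: applying `fwd` then `inv` multiplies the input
    /// by the NTT size. Use [`Plan::normalize`] or [`Plan::mul_assign_normalize`]
    /// to account for that factor.
    ///
    /// # Panics
    ///
    /// Panics if `buf.len()` is not equal to the NTT size.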
    pub fn inv(&self, buf: &mut [u64]) {
        assert_eq!(buf.len(), self.ntt_size());
        let p = self.p;

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        if p < (1u64 << 50) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_50bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                return;
            }
        } else if p < (1u64 << 51) {
            if let Some(simd) = crate::V4IFma::try_new() {
                less_than_51bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                return;
            }
        }

        if p < (1u64 << 62) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_62bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_62bit::inv_avx2(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
            }
            less_than_62bit::inv_scalar(p, buf, &self.inv_twid, &self.inv_twid_shoup);
        } else if p < (1u64 << 63) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    less_than_63bit::inv_avx512(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    less_than_63bit::inv_avx2(simd, p, buf, &self.inv_twid, &self.inv_twid_shoup);
                    return;
                }
            }
            less_than_63bit::inv_scalar(p, buf, &self.inv_twid, &self.inv_twid_shoup);
        } else if p == Solinas::P {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            {
                #[cfg(feature = "nightly")]
                if let Some(simd) = crate::V4::try_new() {
                    generic_solinas::inv_avx512(simd, buf, Solinas, (), &self.inv_twid);
                    return;
                }
                if let Some(simd) = crate::V3::try_new() {
                    generic_solinas::inv_avx2(simd, buf, Solinas, (), &self.inv_twid);
                    return;
                }
            }
            generic_solinas::inv_scalar(buf, Solinas, (), &self.inv_twid);
        } else {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                let crate::u256 { x0, x1, x2, x3 } = self.p_div.double_reciprocal;
                let p_div = (p, x0, x1, x2, x3);
                generic_solinas::inv_avx512(simd, buf, p, p_div, &self.inv_twid);
                return;
            }
            generic_solinas::inv_scalar(buf, p, self.p_div, &self.inv_twid);
        }
    }

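    /// Multiplies the values of `lhs` by those of `rhs` elementwise, modulo the
    /// plan's modulus, and also multiplies them by the inverse of the NTT size,
    /// so that a following [`Plan::inv`] directly yields the normalized result.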
    pub fn mul_assign_normalize(&self, lhs: &mut [u64], rhs: &[u64]) {
        let p = self.p;

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        let has_ifma = (p < (1u64 << 51)) && crate::V4IFma::try_new().is_some();

        if p < (1 << 63) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if has_ifma {
                let simd = crate::V4IFma::try_new().unwrap();
                mul_assign_normalize_ifma(
                    simd,
                    lhs,
                    rhs,
                    p,
                    self.p_barrett,
                    self.big_q,
                    self.n_inv_mod_p,
                    self.n_inv_mod_p_shoup,
                );
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                mul_assign_normalize_avx512(
                    simd,
                    lhs,
                    rhs,
                    p,
                    self.p_barrett,
                    self.big_q,
                    self.n_inv_mod_p,
                    self.n_inv_mod_p_shoup,
                );
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            if let Some(simd) = crate::V3::try_new() {
                mul_assign_normalize_avx2(
                    simd,
                    lhs,
                    rhs,
                    p,
                    self.p_barrett,
                    self.big_q,
                    self.n_inv_mod_p,
                    self.n_inv_mod_p_shoup,
                );
                return;
            }

            mul_assign_normalize_scalar(
                lhs,
                rhs,
                p,
                self.p_barrett,
                self.big_q,
                self.n_inv_mod_p,
                self.n_inv_mod_p_shoup,
            );
        } else if p == Solinas::P {
            let n_inv_mod_p = self.n_inv_mod_p;
            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = *lhs_;
                let rhs = *rhs;
                let prod = <Solinas as PrimeModulus>::mul((), lhs, rhs);
                let prod = <Solinas as PrimeModulus>::mul((), prod, n_inv_mod_p);
                *lhs_ = prod;
            }
        } else {
            let p_div = self.p_div;
            let n_inv_mod_p = self.n_inv_mod_p;
            for (lhs_, rhs) in crate::izip!(lhs, rhs) {
                let lhs = *lhs_;
                let rhs = *rhs;
                let prod = <u64 as PrimeModulus>::mul(p_div, lhs, rhs);
                let prod = <u64 as PrimeModulus>::mul(p_div, prod, n_inv_mod_p);
                *lhs_ = prod;
            }
        }
    }

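    /// Multiplies the values of `values` by the inverse of the NTT size, modulo
    /// the plan's modulus.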
    pub fn normalize(&self, values: &mut [u64]) {
        let p = self.p;

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        let has_ifma = (p < (1u64 << 51)) && crate::V4IFma::try_new().is_some();

        if p < (1 << 63) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if has_ifma {
                let simd = crate::V4IFma::try_new().unwrap();
                normalize_ifma(simd, values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                normalize_avx512(simd, values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            if let Some(simd) = crate::V3::try_new() {
                normalize_avx2(simd, values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
                return;
            }

            normalize_scalar(values, p, self.n_inv_mod_p, self.n_inv_mod_p_shoup);
        } else if p == Solinas::P {
            let n_inv_mod_p = self.n_inv_mod_p;
            for val in values {
                let prod = <Solinas as PrimeModulus>::mul((), *val, n_inv_mod_p);
                *val = prod;
            }
        } else {
            let n_inv_mod_p = self.n_inv_mod_p;
            let p_div = self.p_div;
            for val in values {
                let prod = <u64 as PrimeModulus>::mul(p_div, *val, n_inv_mod_p);
                *val = prod;
            }
        }
    }

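    /// Adds the elementwise product of `lhs` and `rhs`, modulo the plan's
    /// modulus, to the values of `acc`.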
    pub fn mul_accumulate(&self, acc: &mut [u64], lhs: &[u64], rhs: &[u64]) {
        let p = self.p;

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        #[cfg(feature = "nightly")]
        let has_ifma = (p < (1u64 << 51)) && crate::V4IFma::try_new().is_some();

        if p < (1 << 63) {
            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if has_ifma {
                let simd = crate::V4IFma::try_new().unwrap();
                mul_accumulate_ifma(simd, acc, lhs, rhs, p, self.p_barrett, self.big_q);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            #[cfg(feature = "nightly")]
            if let Some(simd) = crate::V4::try_new() {
                mul_accumulate_avx512(simd, acc, lhs, rhs, p, self.p_barrett, self.big_q);
                return;
            }

            #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
            if let Some(simd) = crate::V3::try_new() {
                mul_accumulate_avx2(simd, acc, lhs, rhs, p, self.p_barrett, self.big_q);
                return;
            }

            mul_accumulate_scalar(acc, lhs, rhs, p, self.p_barrett, self.big_q);
        } else if p == Solinas::P {
            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let prod = <Solinas as PrimeModulus>::mul((), *lhs, *rhs);
                *acc = <Solinas as PrimeModulus>::add(Solinas, *acc, prod);
            }
        } else {
            let p_div = self.p_div;
            for (acc, lhs, rhs) in crate::izip!(acc, lhs, rhs) {
                let prod = <u64 as PrimeModulus>::mul(p_div, *lhs, *rhs);
                *acc = <u64 as PrimeModulus>::add(p, *acc, prod);
            }
        }
    }
}

#[cfg(test)]
pub mod tests {
    use super::*;
    use crate::{
        fastdiv::Div64, prime::largest_prime_in_arithmetic_progression64,
        prime64::generic_solinas::PrimeModulus,
    };
    use alloc::{vec, vec::Vec};
    use rand::random;

    extern crate alloc;

    pub fn add(p: u64, a: u64, b: u64) -> u64 {
        let neg_b = p.wrapping_sub(b);
        if a >= neg_b {
            a - neg_b
        } else {
            a + b
        }
    }

    pub fn sub(p: u64, a: u64, b: u64) -> u64 {
        let neg_b = p.wrapping_sub(b);
        if a >= b {
            a - b
        } else {
            a + neg_b
        }
    }

    // `p == 0` encodes a modulus of 2^64, i.e. plain wrapping arithmetic; the
    // wrapping `add`/`sub` helpers above are consistent with this convention.
    pub fn mul(p: u64, a: u64, b: u64) -> u64 {
        let wide = a as u128 * b as u128;
        if p == 0 {
            wide as u64
        } else {
            (wide % p as u128) as u64
        }
    }

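    // Schoolbook oracle: computes the full 2n-coefficient product, then reduces
    // it modulo X^n + 1 (X^n = -1), so coefficient i + n wraps around with a
    // sign flip.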
    pub fn negacyclic_convolution(n: usize, p: u64, lhs: &[u64], rhs: &[u64]) -> vec::Vec<u64> {
        let mut full_convolution = vec![0u64; 2 * n];
        let mut negacyclic_convolution = vec![0u64; n];
        for i in 0..n {
            for j in 0..n {
                full_convolution[i + j] = add(p, full_convolution[i + j], mul(p, lhs[i], rhs[j]));
            }
        }
        for i in 0..n {
            negacyclic_convolution[i] = sub(p, full_convolution[i], full_convolution[i + n]);
        }
        negacyclic_convolution
    }

    pub fn random_lhs_rhs_with_negacyclic_convolution(
        n: usize,
        p: u64,
    ) -> (vec::Vec<u64>, vec::Vec<u64>, vec::Vec<u64>) {
        let mut lhs = vec![0u64; n];
        let mut rhs = vec![0u64; n];

        for x in &mut lhs {
            *x = random();
            if p != 0 {
                *x %= p;
            }
        }
        for x in &mut rhs {
            *x = random();
            if p != 0 {
                *x %= p;
            }
        }

        let lhs = lhs;
        let rhs = rhs;

        let negacyclic_convolution = negacyclic_convolution(n, p, &lhs, &rhs);
        (lhs, rhs, negacyclic_convolution)
    }

    #[test]
    fn test_product() {
        for n in [16, 32, 64, 128, 256, 512, 1024] {
            for p in [
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 49, 1 << 50).unwrap(),
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap(),
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 61, 1 << 62).unwrap(),
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 62, 1 << 63).unwrap(),
                Solinas::P,
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 63, u64::MAX).unwrap(),
            ] {
                let plan = Plan::try_new(n, p).unwrap();

                let (lhs, rhs, negacyclic_convolution) =
                    random_lhs_rhs_with_negacyclic_convolution(n, p);

                let mut prod = vec![0u64; n];
                let mut lhs_fourier = lhs.clone();
                let mut rhs_fourier = rhs.clone();

                plan.fwd(&mut lhs_fourier);
                plan.fwd(&mut rhs_fourier);

                for x in &lhs_fourier {
                    assert!(*x < p);
                }
                for x in &rhs_fourier {
                    assert!(*x < p);
                }

                for i in 0..n {
                    prod[i] =
                        <u64 as PrimeModulus>::mul(Div64::new(p), lhs_fourier[i], rhs_fourier[i]);
                }
                plan.inv(&mut prod);

                plan.mul_assign_normalize(&mut lhs_fourier, &rhs_fourier);
                plan.inv(&mut lhs_fourier);

                for x in &prod {
                    assert!(*x < p);
                }

                for i in 0..n {
                    assert_eq!(
                        prod[i],
                        <u64 as PrimeModulus>::mul(
                            Div64::new(p),
                            negacyclic_convolution[i],
                            n as u64
                        ),
                    );
                }
                assert_eq!(lhs_fourier, negacyclic_convolution);
            }
        }
    }

    #[test]
    fn test_normalize_scalar() {
        let p = largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
        let p_div = Div64::new(p);
        let polynomial_size = 128;

        let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
        let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;

        let mut val = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let mut val_target = val.clone();

        let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

        for val in val_target.iter_mut() {
            *val = mul(*val, n_inv_mod_p);
        }

        normalize_scalar(&mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
        assert_eq!(val, val_target);
    }

    #[test]
    fn test_mul_assign_normalize_scalar() {
        let p = largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
        let p_div = Div64::new(p);
        let polynomial_size = 128;

        let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
        let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
        let big_q = (p.ilog2() + 1) as u64;
        let big_l = big_q + 63;
        let p_barrett = ((1u128 << big_l) / p as u128) as u64;

        let mut lhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let mut lhs_target = lhs.clone();
        let rhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();

        let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

        for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
            *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
        }

        mul_assign_normalize_scalar(
            &mut lhs,
            &rhs,
            p,
            p_barrett,
            big_q,
            n_inv_mod_p,
            n_inv_mod_p_shoup,
        );
        assert_eq!(lhs, lhs_target);
    }

    #[test]
    fn test_mul_accumulate_scalar() {
        let p = largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
        let polynomial_size = 128;

        let big_q = (p.ilog2() + 1) as u64;
        let big_l = big_q + 63;
        let p_barrett = ((1u128 << big_l) / p as u128) as u64;

        let mut acc = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let mut acc_target = acc.clone();
        let lhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();
        let rhs = (0..polynomial_size)
            .map(|_| rand::random::<u64>() % p)
            .collect::<Vec<_>>();

        let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
        let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);

        for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
            *acc = add(mul(*lhs, *rhs), *acc);
        }

        mul_accumulate_scalar(&mut acc, &lhs, &rhs, p, p_barrett, big_q);
        assert_eq!(acc, acc_target);
    }

    #[test]
    fn test_mul_accumulate() {
        for p in [
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 51).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 61).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 62).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 63).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, u64::MAX).unwrap(),
        ] {
            let polynomial_size = 128;

            let mut acc = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut acc_target = acc.clone();
            let lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
            let add = |a: u64, b: u64| ((a as u128 + b as u128) % p as u128) as u64;

            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
                *acc = add(mul(*lhs, *rhs), *acc);
            }

            Plan::try_new(polynomial_size, p)
                .unwrap()
                .mul_accumulate(&mut acc, &lhs, &rhs);
            assert_eq!(acc, acc_target);
        }
    }

    #[test]
    fn test_mul_assign_normalize() {
        for p in [
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 51).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 61).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 62).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 63).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, u64::MAX).unwrap(),
        ] {
            let polynomial_size = 128;
            let p_div = Div64::new(p);
            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);

            let mut lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut lhs_target = lhs.clone();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
            }

            Plan::try_new(polynomial_size, p)
                .unwrap()
                .mul_assign_normalize(&mut lhs, &rhs);
            assert_eq!(lhs, lhs_target);
        }
    }

    #[test]
    fn test_normalize() {
        for p in [
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 51).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 61).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 62).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, 1 << 63).unwrap(),
            largest_prime_in_arithmetic_progression64(1 << 16, 1, 0, u64::MAX).unwrap(),
        ] {
            let polynomial_size = 128;
            let p_div = Div64::new(p);
            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);

            let mut val = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut val_target = val.clone();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for val in &mut val_target {
                *val = mul(*val, n_inv_mod_p);
            }

            Plan::try_new(polynomial_size, p)
                .unwrap()
                .normalize(&mut val);
            assert_eq!(val, val_target);
        }
    }
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[cfg(test)]
mod x86_tests {
    use super::*;
    use crate::prime::largest_prime_in_arithmetic_progression64;
    use alloc::vec::Vec;
    use rand::random as rnd;

    extern crate alloc;

    #[test]
    fn test_interleaves_and_permutes_u64x4() {
        if let Some(simd) = crate::V3::try_new() {
            let a = u64x4(rnd(), rnd(), rnd(), rnd());
            let b = u64x4(rnd(), rnd(), rnd(), rnd());

            assert_eq!(
                simd.interleave2_u64x4([a, b]),
                [u64x4(a.0, a.1, b.0, b.1), u64x4(a.2, a.3, b.2, b.3)],
            );
            assert_eq!(
                simd.interleave2_u64x4(simd.interleave2_u64x4([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd()];
            assert_eq!(simd.permute2_u64x4(w), u64x4(w[0], w[0], w[1], w[1]));

            assert_eq!(
                simd.interleave1_u64x4([a, b]),
                [u64x4(a.0, b.0, a.2, b.2), u64x4(a.1, b.1, a.3, b.3)],
            );
            assert_eq!(
                simd.interleave1_u64x4(simd.interleave1_u64x4([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd(), rnd(), rnd()];
            assert_eq!(simd.permute1_u64x4(w), u64x4(w[0], w[2], w[1], w[3]));
        }
    }

    #[cfg(feature = "nightly")]
    #[test]
    fn test_interleaves_and_permutes_u64x8() {
        if let Some(simd) = crate::V4::try_new() {
            let a = u64x8(rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd());
            let b = u64x8(rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd());

            assert_eq!(
                simd.interleave4_u64x8([a, b]),
                [
                    u64x8(a.0, a.1, a.2, a.3, b.0, b.1, b.2, b.3),
                    u64x8(a.4, a.5, a.6, a.7, b.4, b.5, b.6, b.7),
                ],
            );
            assert_eq!(
                simd.interleave4_u64x8(simd.interleave4_u64x8([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd()];
            assert_eq!(
                simd.permute4_u64x8(w),
                u64x8(w[0], w[0], w[0], w[0], w[1], w[1], w[1], w[1]),
            );

            assert_eq!(
                simd.interleave2_u64x8([a, b]),
                [
                    u64x8(a.0, a.1, b.0, b.1, a.4, a.5, b.4, b.5),
                    u64x8(a.2, a.3, b.2, b.3, a.6, a.7, b.6, b.7),
                ],
            );
            assert_eq!(
                simd.interleave2_u64x8(simd.interleave2_u64x8([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd(), rnd(), rnd()];
            assert_eq!(
                simd.permute2_u64x8(w),
                u64x8(w[0], w[0], w[2], w[2], w[1], w[1], w[3], w[3]),
            );

            assert_eq!(
                simd.interleave1_u64x8([a, b]),
                [
                    u64x8(a.0, b.0, a.2, b.2, a.4, b.4, a.6, b.6),
                    u64x8(a.1, b.1, a.3, b.3, a.5, b.5, a.7, b.7),
                ],
            );
            assert_eq!(
                simd.interleave1_u64x8(simd.interleave1_u64x8([a, b])),
                [a, b],
            );
            let w = [rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd(), rnd()];
            assert_eq!(
                simd.permute1_u64x8(w),
                u64x8(w[0], w[4], w[1], w[5], w[2], w[6], w[3], w[7]),
            );
        }
    }

    #[cfg(feature = "nightly")]
    #[test]
    fn test_mul_assign_normalize_ifma() {
        if let Some(simd) = crate::V4IFma::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap();
            let p_div = Div64::new(p);
            let polynomial_size = 128;

            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 52) / p as u128) as u64;
            let big_q = (p.ilog2() + 1) as u64;
            let big_l = big_q + 51;
            let p_barrett = ((1u128 << big_l) / p as u128) as u64;

            let mut lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut lhs_target = lhs.clone();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
            }

            mul_assign_normalize_ifma(
                simd,
                &mut lhs,
                &rhs,
                p,
                p_barrett,
                big_q,
                n_inv_mod_p,
                n_inv_mod_p_shoup,
            );
            assert_eq!(lhs, lhs_target);
        }
    }

    #[cfg(feature = "nightly")]
    #[test]
    fn test_mul_assign_normalize_avx512() {
        if let Some(simd) = crate::V4::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
            let p_div = Div64::new(p);
            let polynomial_size = 128;

            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
            let big_q = (p.ilog2() + 1) as u64;
            let big_l = big_q + 63;
            let p_barrett = ((1u128 << big_l) / p as u128) as u64;

            let mut lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut lhs_target = lhs.clone();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
            }

            mul_assign_normalize_avx512(
                simd,
                &mut lhs,
                &rhs,
                p,
                p_barrett,
                big_q,
                n_inv_mod_p,
                n_inv_mod_p_shoup,
            );
            assert_eq!(lhs, lhs_target);
        }
    }

    #[test]
    fn test_mul_assign_normalize_avx2() {
        if let Some(simd) = crate::V3::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
            let p_div = Div64::new(p);
            let polynomial_size = 128;

            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;
            let big_q = (p.ilog2() + 1) as u64;
            let big_l = big_q + 63;
            let p_barrett = ((1u128 << big_l) / p as u128) as u64;

            let mut lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut lhs_target = lhs.clone();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for (lhs, rhs) in lhs_target.iter_mut().zip(&rhs) {
                *lhs = mul(mul(*lhs, *rhs), n_inv_mod_p);
            }

            mul_assign_normalize_avx2(
                simd,
                &mut lhs,
                &rhs,
                p,
                p_barrett,
                big_q,
                n_inv_mod_p,
                n_inv_mod_p_shoup,
            );
            assert_eq!(lhs, lhs_target);
        }
    }

    #[cfg(feature = "nightly")]
    #[test]
    fn test_mul_accumulate_ifma() {
        if let Some(simd) = crate::V4IFma::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap();
            let polynomial_size = 128;

            let big_q = (p.ilog2() + 1) as u64;
            let big_l = big_q + 51;
            let p_barrett = ((1u128 << big_l) / p as u128) as u64;

            let mut acc = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut acc_target = acc.clone();
            let lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
            let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);

            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
                *acc = add(mul(*lhs, *rhs), *acc);
            }

            mul_accumulate_ifma(simd, &mut acc, &lhs, &rhs, p, p_barrett, big_q);
            assert_eq!(acc, acc_target);
        }
    }

    #[cfg(feature = "nightly")]
    #[test]
    fn test_mul_accumulate_avx512() {
        if let Some(simd) = crate::V4::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
            let polynomial_size = 128;

            let big_q = (p.ilog2() + 1) as u64;
            let big_l = big_q + 63;
            let p_barrett = ((1u128 << big_l) / p as u128) as u64;

            let mut acc = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut acc_target = acc.clone();
            let lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
            let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);

            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
                *acc = add(mul(*lhs, *rhs), *acc);
            }

            mul_accumulate_avx512(simd, &mut acc, &lhs, &rhs, p, p_barrett, big_q);
            assert_eq!(acc, acc_target);
        }
    }

    #[test]
    fn test_mul_accumulate_avx2() {
        if let Some(simd) = crate::V3::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
            let polynomial_size = 128;

            let big_q = (p.ilog2() + 1) as u64;
            let big_l = big_q + 63;
            let p_barrett = ((1u128 << big_l) / p as u128) as u64;

            let mut acc = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut acc_target = acc.clone();
            let lhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let rhs = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;
            let add = |a: u64, b: u64| <u64 as PrimeModulus>::add(p, a, b);

            for (acc, lhs, rhs) in crate::izip!(&mut acc_target, &lhs, &rhs) {
                *acc = add(mul(*lhs, *rhs), *acc);
            }

            mul_accumulate_avx2(simd, &mut acc, &lhs, &rhs, p, p_barrett, big_q);
            assert_eq!(acc, acc_target);
        }
    }

    #[cfg(feature = "nightly")]
    #[test]
    fn test_normalize_ifma() {
        if let Some(simd) = crate::V4IFma::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 51).unwrap();
            let p_div = Div64::new(p);
            let polynomial_size = 128;

            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 52) / p as u128) as u64;

            let mut val = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut val_target = val.clone();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for val in val_target.iter_mut() {
                *val = mul(*val, n_inv_mod_p);
            }

            normalize_ifma(simd, &mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
            assert_eq!(val, val_target);
        }
    }

    #[cfg(feature = "nightly")]
    #[test]
    fn test_normalize_avx512() {
        if let Some(simd) = crate::V4::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
            let p_div = Div64::new(p);
            let polynomial_size = 128;

            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;

            let mut val = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut val_target = val.clone();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for val in val_target.iter_mut() {
                *val = mul(*val, n_inv_mod_p);
            }

            normalize_avx512(simd, &mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
            assert_eq!(val, val_target);
        }
    }

    #[test]
    fn test_normalize_avx2() {
        if let Some(simd) = crate::V3::try_new() {
            let p =
                largest_prime_in_arithmetic_progression64(1 << 16, 1, 1 << 50, 1 << 63).unwrap();
            let p_div = Div64::new(p);
            let polynomial_size = 128;

            let n_inv_mod_p = crate::prime::exp_mod64(p_div, polynomial_size as u64, p - 2);
            let n_inv_mod_p_shoup = (((n_inv_mod_p as u128) << 64) / p as u128) as u64;

            let mut val = (0..polynomial_size)
                .map(|_| rand::random::<u64>() % p)
                .collect::<Vec<_>>();
            let mut val_target = val.clone();

            let mul = |a: u64, b: u64| ((a as u128 * b as u128) % p as u128) as u64;

            for val in val_target.iter_mut() {
                *val = mul(*val, n_inv_mod_p);
            }

            normalize_avx2(simd, &mut val, p, n_inv_mod_p, n_inv_mod_p_shoup);
            assert_eq!(val, val_target);
        }
    }

    #[test]
    fn test_plan_crash_github_11() {
        assert!(Plan::try_new(2048, 1024).is_none());
    }
}
1906}