Skip to main content

sqisign_verify/fp/
level5.rs

1//!
2//! The prime is `p = 27 * 2^500 - 1`. Field elements are 9-limb arrays
3//! `[u64; 9]` storing values in unsaturated radix `2^57`: each limb
4//! holds a roughly-57-bit quantity, leaving 7 carry bits at the top.
5//! Arithmetic operates in an internal Montgomery form; canonical
6//! little-endian byte encoding (via `fp_encode` / `fp_decode`)
7//! returns the standard integer representation in `[0, p)`.
8//!
9//! The free functions below implement the limb-level primitives;
10//! the [`FpBackend`] trait impl for [`Level5`] at the bottom of the
11//! file wires them into the generic [`super::Fp`] API.
12
13use super::FpBackend;
14use crate::params::Level5;
15use hybrid_array::Array;
16use subtle::Choice;
17
18/// Number of 64-bit limbs in an `Fp` element.
19pub const NLIMBS: usize = 9;
20
21/// Bit width of each unsaturated limb.
22pub const RADIX: u32 = 57;
23
24/// `2^RADIX - 1`: keeps the low 57 bits of a limb.
25pub const MASK: u64 = (1u64 << RADIX) - 1;
26
27/// The contribution of `p + 1 = 27 * 2^500` to limb 8 in radix-`2^57`:
28/// `27 * 2^500 / 2^(57*8) = 27 * 2^44`. Used as the Montgomery folding
29/// constant inside `modmul` and `modsqr`.
30pub const P8: u64 = 0x1b00000000000u64;
31
32/// `2 * P8`, used inside `modadd`, `modsub`, and `modneg` to
33/// keep partial sums in `[-p, p)` before the final reduction step.
34pub const TWO_P8: u64 = 0x3600000000000u64;
35
36/// `0` as 9 radix-`2^57` limbs.
37pub const ZERO: [u64; NLIMBS] = [0; NLIMBS];
38
39/// Internal Montgomery form of `1 mod p`.
40pub const ONE: [u64; NLIMBS] = [
41    0x000000000000012f,
42    0x0000000000000000,
43    0x0000000000000000,
44    0x0000000000000000,
45    0x0000000000000000,
46    0x0000000000000000,
47    0x0000000000000000,
48    0x0000000000000000,
49    0x0000b00000000000,
50];
51
52/// Internal Montgomery form of 2⁻¹ mod p.
53pub const TWO_INV: [u64; NLIMBS] = [
54    0x0000000000000097,
55    0x0000000000000000,
56    0x0000000000000000,
57    0x0000000000000000,
58    0x0000000000000000,
59    0x0000000000000000,
60    0x0000000000000000,
61    0x0000000000000000,
62    0x0001300000000000,
63];
64
65/// Internal Montgomery form of 3⁻¹ mod p.
66pub const THREE_INV: [u64; NLIMBS] = [
67    0x00aaaaaaaaaaab0f,
68    0x0155555555555555,
69    0x00aaaaaaaaaaaaaa,
70    0x0155555555555555,
71    0x00aaaaaaaaaaaaaa,
72    0x0155555555555555,
73    0x00aaaaaaaaaaaaaa,
74    0x0155555555555555,
75    0x00015aaaaaaaaaaa,
76];
77
78/// Internal Montgomery form of `2^512 mod p`. Used by
79/// `fp_decode_reduce` when folding 64-byte input blocks.
80pub const R2: [u64; NLIMBS] = [
81    0x0012f684bda1e334,
82    0x01425ed097b425ed,
83    0x01684bda12f684bd,
84    0x01ed097b425ed097,
85    0x00bda12f684bda12,
86    0x0097b425ed097b42,
87    0x0012f684bda12f68,
88    0x01425ed097b425ed,
89    0x00008bda12f684bd,
90];
91
92/// Conversion constant for `nres`: multiplying by `R2_NRES` and
93/// dividing by Montgomery `R` takes a canonical integer into internal
94/// Montgomery form (`R2_NRES = R^2 mod p` in the internal Montgomery
95/// scheme).
96pub const R2_NRES: [u64; NLIMBS] = [
97    0x25ed097b43c668,
98    0x84bda12f684bda,
99    0xd097b425ed097b,
100    0x1da12f684bda12f,
101    0x17b425ed097b425,
102    0x12f684bda12f684,
103    0x25ed097b425ed0,
104    0x84bda12f684bda,
105    0x117b425ed097b,
106];
107
108/// Propagate carries between limbs, leaving each of limbs 0..7 in
109/// `[0, 2^57)` and limb 8 holding the accumulated high bits. Returns
110/// `0xFFFF_FFFF_FFFF_FFFF` if the value (interpreted as two's-complement
111/// via the top bits of limb 8) is negative, else `0`.
112#[inline]
113pub(crate) fn prop(n: &mut [u64; NLIMBS]) -> u64 {
114    let mask = MASK;
115    let mut carry: i64 = n[0] as i64;
116    carry >>= RADIX;
117    n[0] &= mask;
118    for limb in n.iter_mut().take(8).skip(1) {
119        carry = carry.wrapping_add(*limb as i64);
120        *limb = (carry as u64) & mask;
121        carry >>= RADIX;
122    }
123    n[8] = n[8].wrapping_add(carry as u64);
124    // Sign mask: 0 if non-negative, all-ones if negative.
125    // Limb 8 is not masked to radix width; the sign occupies the high bits.
126    let sign = (n[8] >> 1) >> 62;
127    0u64.wrapping_sub(sign)
128}
129
130/// [`prop`] followed by a conditional add of `p` if the propagation
131/// detected a negative intermediate, then a second [`prop`] pass.
132/// Returns `1` if the value was originally negative (and was just
133/// fixed up), `0` otherwise.
134#[inline]
135pub(crate) fn flatten(n: &mut [u64; NLIMBS]) -> u32 {
136    let carry = prop(n);
137    n[0] = n[0].wrapping_sub(1 & carry);
138    n[8] = n[8].wrapping_add(0x1b00000000000u64 & carry);
139    let _ = prop(n);
140    (carry & 1) as u32
141}
142
143/// Final subtract of `p`: tries `n -= p` and adds `p` back if the
144/// result went negative. Returns `1` if the original value was `< p`
145/// (and was preserved), `0` if it was `>= p` (and was reduced).
146#[inline]
147pub(crate) fn modfsb(n: &mut [u64; NLIMBS]) -> u32 {
148    n[0] = n[0].wrapping_add(1);
149    n[8] = n[8].wrapping_sub(0x1b00000000000u64);
150    flatten(n)
151}
152
153/// `n <- a + b mod 2p` (lazily reduced).
154#[inline]
155pub(crate) fn modadd(n: &mut [u64; NLIMBS], a: &[u64; NLIMBS], b: &[u64; NLIMBS]) {
156    n[0] = a[0].wrapping_add(b[0]);
157    n[1] = a[1].wrapping_add(b[1]);
158    n[2] = a[2].wrapping_add(b[2]);
159    n[3] = a[3].wrapping_add(b[3]);
160    n[4] = a[4].wrapping_add(b[4]);
161    n[5] = a[5].wrapping_add(b[5]);
162    n[6] = a[6].wrapping_add(b[6]);
163    n[7] = a[7].wrapping_add(b[7]);
164    n[8] = a[8].wrapping_add(b[8]);
165    n[0] = n[0].wrapping_add(2);
166    n[8] = n[8].wrapping_sub(TWO_P8);
167    let carry = prop(n);
168    n[0] = n[0].wrapping_sub(2 & carry);
169    n[8] = n[8].wrapping_add(TWO_P8 & carry);
170    let _ = prop(n);
171}
172
173/// `n <- a - b mod 2p` (lazily reduced).
174#[inline]
175pub(crate) fn modsub(n: &mut [u64; NLIMBS], a: &[u64; NLIMBS], b: &[u64; NLIMBS]) {
176    n[0] = a[0].wrapping_sub(b[0]);
177    n[1] = a[1].wrapping_sub(b[1]);
178    n[2] = a[2].wrapping_sub(b[2]);
179    n[3] = a[3].wrapping_sub(b[3]);
180    n[4] = a[4].wrapping_sub(b[4]);
181    n[5] = a[5].wrapping_sub(b[5]);
182    n[6] = a[6].wrapping_sub(b[6]);
183    n[7] = a[7].wrapping_sub(b[7]);
184    n[8] = a[8].wrapping_sub(b[8]);
185    let carry = prop(n);
186    n[0] = n[0].wrapping_sub(2 & carry);
187    n[8] = n[8].wrapping_add(TWO_P8 & carry);
188    let _ = prop(n);
189}
190
191/// `n <- -b mod 2p` (lazily reduced).
192#[inline]
193pub(crate) fn modneg(n: &mut [u64; NLIMBS], b: &[u64; NLIMBS]) {
194    n[0] = 0u64.wrapping_sub(b[0]);
195    n[1] = 0u64.wrapping_sub(b[1]);
196    n[2] = 0u64.wrapping_sub(b[2]);
197    n[3] = 0u64.wrapping_sub(b[3]);
198    n[4] = 0u64.wrapping_sub(b[4]);
199    n[5] = 0u64.wrapping_sub(b[5]);
200    n[6] = 0u64.wrapping_sub(b[6]);
201    n[7] = 0u64.wrapping_sub(b[7]);
202    n[8] = 0u64.wrapping_sub(b[8]);
203    let carry = prop(n);
204    n[0] = n[0].wrapping_sub(2 & carry);
205    n[8] = n[8].wrapping_add(TWO_P8 & carry);
206    let _ = prop(n);
207}
208
209/// `c <- a * b mod 2p`. Schoolbook 9x9 multiplication in radix `2^57`
210/// with interleaved Montgomery folding: each newly-emitted low limb
211/// `v_i` is added back at limb position `i + 8` as `v_i * P8`, which
212/// represents `v_i * 27 * 2^500 = v_i * (p + 1) = v_i (mod p)` and
213/// effectively divides the final result by Montgomery `R = 2^513`.
214#[inline]
215pub(crate) fn modmul(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS], b: &[u64; NLIMBS]) {
216    let mask = MASK;
217    let p8: u128 = P8 as u128;
218
219    // t accumulates the partial sum at the current limb position. After
220    // emitting each limb (`t & mask`) we shift right by RADIX.
221    let mut t: u128;
222
223    t = (a[0] as u128) * (b[0] as u128);
224    let v0 = (t as u64) & mask;
225    t >>= RADIX;
226
227    t = t
228        .wrapping_add((a[0] as u128) * (b[1] as u128))
229        .wrapping_add((a[1] as u128) * (b[0] as u128));
230    let v1 = (t as u64) & mask;
231    t >>= RADIX;
232
233    t = t
234        .wrapping_add((a[0] as u128) * (b[2] as u128))
235        .wrapping_add((a[1] as u128) * (b[1] as u128))
236        .wrapping_add((a[2] as u128) * (b[0] as u128));
237    let v2 = (t as u64) & mask;
238    t >>= RADIX;
239
240    t = t
241        .wrapping_add((a[0] as u128) * (b[3] as u128))
242        .wrapping_add((a[1] as u128) * (b[2] as u128))
243        .wrapping_add((a[2] as u128) * (b[1] as u128))
244        .wrapping_add((a[3] as u128) * (b[0] as u128));
245    let v3 = (t as u64) & mask;
246    t >>= RADIX;
247
248    t = t
249        .wrapping_add((a[0] as u128) * (b[4] as u128))
250        .wrapping_add((a[1] as u128) * (b[3] as u128))
251        .wrapping_add((a[2] as u128) * (b[2] as u128))
252        .wrapping_add((a[3] as u128) * (b[1] as u128))
253        .wrapping_add((a[4] as u128) * (b[0] as u128));
254    let v4 = (t as u64) & mask;
255    t >>= RADIX;
256
257    t = t
258        .wrapping_add((a[0] as u128) * (b[5] as u128))
259        .wrapping_add((a[1] as u128) * (b[4] as u128))
260        .wrapping_add((a[2] as u128) * (b[3] as u128))
261        .wrapping_add((a[3] as u128) * (b[2] as u128))
262        .wrapping_add((a[4] as u128) * (b[1] as u128))
263        .wrapping_add((a[5] as u128) * (b[0] as u128));
264    let v5 = (t as u64) & mask;
265    t >>= RADIX;
266
267    t = t
268        .wrapping_add((a[0] as u128) * (b[6] as u128))
269        .wrapping_add((a[1] as u128) * (b[5] as u128))
270        .wrapping_add((a[2] as u128) * (b[4] as u128))
271        .wrapping_add((a[3] as u128) * (b[3] as u128))
272        .wrapping_add((a[4] as u128) * (b[2] as u128))
273        .wrapping_add((a[5] as u128) * (b[1] as u128))
274        .wrapping_add((a[6] as u128) * (b[0] as u128));
275    let v6 = (t as u64) & mask;
276    t >>= RADIX;
277
278    t = t
279        .wrapping_add((a[0] as u128) * (b[7] as u128))
280        .wrapping_add((a[1] as u128) * (b[6] as u128))
281        .wrapping_add((a[2] as u128) * (b[5] as u128))
282        .wrapping_add((a[3] as u128) * (b[4] as u128))
283        .wrapping_add((a[4] as u128) * (b[3] as u128))
284        .wrapping_add((a[5] as u128) * (b[2] as u128))
285        .wrapping_add((a[6] as u128) * (b[1] as u128))
286        .wrapping_add((a[7] as u128) * (b[0] as u128));
287    let v7 = (t as u64) & mask;
288    t >>= RADIX;
289
290    t = t
291        .wrapping_add((a[0] as u128) * (b[8] as u128))
292        .wrapping_add((a[1] as u128) * (b[7] as u128))
293        .wrapping_add((a[2] as u128) * (b[6] as u128))
294        .wrapping_add((a[3] as u128) * (b[5] as u128))
295        .wrapping_add((a[4] as u128) * (b[4] as u128))
296        .wrapping_add((a[5] as u128) * (b[3] as u128))
297        .wrapping_add((a[6] as u128) * (b[2] as u128))
298        .wrapping_add((a[7] as u128) * (b[1] as u128))
299        .wrapping_add((a[8] as u128) * (b[0] as u128))
300        .wrapping_add((v0 as u128) * p8);
301    let v8 = (t as u64) & mask;
302    t >>= RADIX;
303
304    t = t
305        .wrapping_add((a[1] as u128) * (b[8] as u128))
306        .wrapping_add((a[2] as u128) * (b[7] as u128))
307        .wrapping_add((a[3] as u128) * (b[6] as u128))
308        .wrapping_add((a[4] as u128) * (b[5] as u128))
309        .wrapping_add((a[5] as u128) * (b[4] as u128))
310        .wrapping_add((a[6] as u128) * (b[3] as u128))
311        .wrapping_add((a[7] as u128) * (b[2] as u128))
312        .wrapping_add((a[8] as u128) * (b[1] as u128))
313        .wrapping_add((v1 as u128) * p8);
314    c[0] = (t as u64) & mask;
315    t >>= RADIX;
316
317    t = t
318        .wrapping_add((a[2] as u128) * (b[8] as u128))
319        .wrapping_add((a[3] as u128) * (b[7] as u128))
320        .wrapping_add((a[4] as u128) * (b[6] as u128))
321        .wrapping_add((a[5] as u128) * (b[5] as u128))
322        .wrapping_add((a[6] as u128) * (b[4] as u128))
323        .wrapping_add((a[7] as u128) * (b[3] as u128))
324        .wrapping_add((a[8] as u128) * (b[2] as u128))
325        .wrapping_add((v2 as u128) * p8);
326    c[1] = (t as u64) & mask;
327    t >>= RADIX;
328
329    t = t
330        .wrapping_add((a[3] as u128) * (b[8] as u128))
331        .wrapping_add((a[4] as u128) * (b[7] as u128))
332        .wrapping_add((a[5] as u128) * (b[6] as u128))
333        .wrapping_add((a[6] as u128) * (b[5] as u128))
334        .wrapping_add((a[7] as u128) * (b[4] as u128))
335        .wrapping_add((a[8] as u128) * (b[3] as u128))
336        .wrapping_add((v3 as u128) * p8);
337    c[2] = (t as u64) & mask;
338    t >>= RADIX;
339
340    t = t
341        .wrapping_add((a[4] as u128) * (b[8] as u128))
342        .wrapping_add((a[5] as u128) * (b[7] as u128))
343        .wrapping_add((a[6] as u128) * (b[6] as u128))
344        .wrapping_add((a[7] as u128) * (b[5] as u128))
345        .wrapping_add((a[8] as u128) * (b[4] as u128))
346        .wrapping_add((v4 as u128) * p8);
347    c[3] = (t as u64) & mask;
348    t >>= RADIX;
349
350    t = t
351        .wrapping_add((a[5] as u128) * (b[8] as u128))
352        .wrapping_add((a[6] as u128) * (b[7] as u128))
353        .wrapping_add((a[7] as u128) * (b[6] as u128))
354        .wrapping_add((a[8] as u128) * (b[5] as u128))
355        .wrapping_add((v5 as u128) * p8);
356    c[4] = (t as u64) & mask;
357    t >>= RADIX;
358
359    t = t
360        .wrapping_add((a[6] as u128) * (b[8] as u128))
361        .wrapping_add((a[7] as u128) * (b[7] as u128))
362        .wrapping_add((a[8] as u128) * (b[6] as u128))
363        .wrapping_add((v6 as u128) * p8);
364    c[5] = (t as u64) & mask;
365    t >>= RADIX;
366
367    t = t
368        .wrapping_add((a[7] as u128) * (b[8] as u128))
369        .wrapping_add((a[8] as u128) * (b[7] as u128))
370        .wrapping_add((v7 as u128) * p8);
371    c[6] = (t as u64) & mask;
372    t >>= RADIX;
373
374    t = t
375        .wrapping_add((a[8] as u128) * (b[8] as u128))
376        .wrapping_add((v8 as u128) * p8);
377    c[7] = (t as u64) & mask;
378    t >>= RADIX;
379
380    c[8] = t as u64;
381}
382
383/// `c <- a * a mod 2p` (specialized squaring). Each cross-term
384/// `a_i * a_j` (`i != j`) is computed once and doubled, saving roughly
385/// half the partial-product work compared to a general multiply.
386#[inline]
387pub(crate) fn modsqr(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS]) {
388    let mask = MASK;
389    let p8: u128 = P8 as u128;
390
391    let mut t: u128;
392    let mut tot: u128;
393
394    tot = (a[0] as u128) * (a[0] as u128);
395    t = tot;
396    let v0 = (t as u64) & mask;
397    t >>= RADIX;
398
399    tot = (a[0] as u128) * (a[1] as u128);
400    tot = tot.wrapping_mul(2);
401    t = t.wrapping_add(tot);
402    let v1 = (t as u64) & mask;
403    t >>= RADIX;
404
405    tot = (a[0] as u128) * (a[2] as u128);
406    tot = tot.wrapping_mul(2);
407    tot = tot.wrapping_add((a[1] as u128) * (a[1] as u128));
408    t = t.wrapping_add(tot);
409    let v2 = (t as u64) & mask;
410    t >>= RADIX;
411
412    tot = (a[0] as u128) * (a[3] as u128);
413    tot = tot.wrapping_add((a[1] as u128) * (a[2] as u128));
414    tot = tot.wrapping_mul(2);
415    t = t.wrapping_add(tot);
416    let v3 = (t as u64) & mask;
417    t >>= RADIX;
418
419    tot = (a[0] as u128) * (a[4] as u128);
420    tot = tot.wrapping_add((a[1] as u128) * (a[3] as u128));
421    tot = tot.wrapping_mul(2);
422    tot = tot.wrapping_add((a[2] as u128) * (a[2] as u128));
423    t = t.wrapping_add(tot);
424    let v4 = (t as u64) & mask;
425    t >>= RADIX;
426
427    tot = (a[0] as u128) * (a[5] as u128);
428    tot = tot.wrapping_add((a[1] as u128) * (a[4] as u128));
429    tot = tot.wrapping_add((a[2] as u128) * (a[3] as u128));
430    tot = tot.wrapping_mul(2);
431    t = t.wrapping_add(tot);
432    let v5 = (t as u64) & mask;
433    t >>= RADIX;
434
435    tot = (a[0] as u128) * (a[6] as u128);
436    tot = tot.wrapping_add((a[1] as u128) * (a[5] as u128));
437    tot = tot.wrapping_add((a[2] as u128) * (a[4] as u128));
438    tot = tot.wrapping_mul(2);
439    tot = tot.wrapping_add((a[3] as u128) * (a[3] as u128));
440    t = t.wrapping_add(tot);
441    let v6 = (t as u64) & mask;
442    t >>= RADIX;
443
444    tot = (a[0] as u128) * (a[7] as u128);
445    tot = tot.wrapping_add((a[1] as u128) * (a[6] as u128));
446    tot = tot.wrapping_add((a[2] as u128) * (a[5] as u128));
447    tot = tot.wrapping_add((a[3] as u128) * (a[4] as u128));
448    tot = tot.wrapping_mul(2);
449    t = t.wrapping_add(tot);
450    let v7 = (t as u64) & mask;
451    t >>= RADIX;
452
453    tot = (a[0] as u128) * (a[8] as u128);
454    tot = tot.wrapping_add((a[1] as u128) * (a[7] as u128));
455    tot = tot.wrapping_add((a[2] as u128) * (a[6] as u128));
456    tot = tot.wrapping_add((a[3] as u128) * (a[5] as u128));
457    tot = tot.wrapping_mul(2);
458    tot = tot.wrapping_add((a[4] as u128) * (a[4] as u128));
459    t = t.wrapping_add(tot);
460    t = t.wrapping_add((v0 as u128) * p8);
461    let v8 = (t as u64) & mask;
462    t >>= RADIX;
463
464    tot = (a[1] as u128) * (a[8] as u128);
465    tot = tot.wrapping_add((a[2] as u128) * (a[7] as u128));
466    tot = tot.wrapping_add((a[3] as u128) * (a[6] as u128));
467    tot = tot.wrapping_add((a[4] as u128) * (a[5] as u128));
468    tot = tot.wrapping_mul(2);
469    t = t.wrapping_add(tot);
470    t = t.wrapping_add((v1 as u128) * p8);
471    c[0] = (t as u64) & mask;
472    t >>= RADIX;
473
474    tot = (a[2] as u128) * (a[8] as u128);
475    tot = tot.wrapping_add((a[3] as u128) * (a[7] as u128));
476    tot = tot.wrapping_add((a[4] as u128) * (a[6] as u128));
477    tot = tot.wrapping_mul(2);
478    tot = tot.wrapping_add((a[5] as u128) * (a[5] as u128));
479    t = t.wrapping_add(tot);
480    t = t.wrapping_add((v2 as u128) * p8);
481    c[1] = (t as u64) & mask;
482    t >>= RADIX;
483
484    tot = (a[3] as u128) * (a[8] as u128);
485    tot = tot.wrapping_add((a[4] as u128) * (a[7] as u128));
486    tot = tot.wrapping_add((a[5] as u128) * (a[6] as u128));
487    tot = tot.wrapping_mul(2);
488    t = t.wrapping_add(tot);
489    t = t.wrapping_add((v3 as u128) * p8);
490    c[2] = (t as u64) & mask;
491    t >>= RADIX;
492
493    tot = (a[4] as u128) * (a[8] as u128);
494    tot = tot.wrapping_add((a[5] as u128) * (a[7] as u128));
495    tot = tot.wrapping_mul(2);
496    tot = tot.wrapping_add((a[6] as u128) * (a[6] as u128));
497    t = t.wrapping_add(tot);
498    t = t.wrapping_add((v4 as u128) * p8);
499    c[3] = (t as u64) & mask;
500    t >>= RADIX;
501
502    tot = (a[5] as u128) * (a[8] as u128);
503    tot = tot.wrapping_add((a[6] as u128) * (a[7] as u128));
504    tot = tot.wrapping_mul(2);
505    t = t.wrapping_add(tot);
506    t = t.wrapping_add((v5 as u128) * p8);
507    c[4] = (t as u64) & mask;
508    t >>= RADIX;
509
510    tot = (a[6] as u128) * (a[8] as u128);
511    tot = tot.wrapping_mul(2);
512    tot = tot.wrapping_add((a[7] as u128) * (a[7] as u128));
513    t = t.wrapping_add(tot);
514    t = t.wrapping_add((v6 as u128) * p8);
515    c[5] = (t as u64) & mask;
516    t >>= RADIX;
517
518    tot = (a[7] as u128) * (a[8] as u128);
519    tot = tot.wrapping_mul(2);
520    t = t.wrapping_add(tot);
521    t = t.wrapping_add((v7 as u128) * p8);
522    c[6] = (t as u64) & mask;
523    t >>= RADIX;
524
525    tot = (a[8] as u128) * (a[8] as u128);
526    t = t.wrapping_add(tot);
527    t = t.wrapping_add((v8 as u128) * p8);
528    c[7] = (t as u64) & mask;
529    t >>= RADIX;
530
531    c[8] = t as u64;
532}
533
534/// `c <- a`.
535#[inline]
536pub(crate) fn modcpy(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS]) {
537    *c = *a;
538}
539
540/// Square `a` in-place `n` times.
541#[inline]
542pub(crate) fn modnsqr(a: &mut [u64; NLIMBS], n: u32) {
543    for _ in 0..n {
544        let mut tmp = [0u64; NLIMBS];
545        modsqr(&mut tmp, a);
546        *a = tmp;
547    }
548}
549
550/// Square root progenitor: `z <- w^((p-3)/4) mod p` via a fixed
551/// addition chain whose structure encodes the binary representation
552/// of `(p-3)/4` for this prime.
553#[inline]
554pub(crate) fn modpro(z: &mut [u64; NLIMBS], w: &[u64; NLIMBS]) {
555    let mut x = [0u64; NLIMBS];
556    let mut t0: [u64; NLIMBS];
557    let mut t1 = [0u64; NLIMBS];
558    let mut t2 = [0u64; NLIMBS];
559    let mut t3 = [0u64; NLIMBS];
560    let mut t4: [u64; NLIMBS];
561    let mut t5 = [0u64; NLIMBS];
562    let mut t6 = [0u64; NLIMBS];
563
564    // modcpy(w, x);
565    modcpy(&mut x, w);
566    // modcpy(x, z);
567    modcpy(z, &x);
568    // modnsqr(z, 2);
569    modnsqr(z, 2);
570    // modmul(x, z, t0);
571    {
572        let mut tmp = [0u64; NLIMBS];
573        modmul(&mut tmp, &x, z);
574        t0 = tmp;
575    }
576    // modmul(x, t0, z);
577    {
578        let mut tmp = [0u64; NLIMBS];
579        modmul(&mut tmp, &x, &t0);
580        *z = tmp;
581    }
582    // modsqr(z, t1);
583    {
584        modsqr(&mut t1, z);
585    }
586    // modmul(x, t1, t1);
587    {
588        let mut tmp = [0u64; NLIMBS];
589        modmul(&mut tmp, &x, &t1);
590        t1 = tmp;
591    }
592    // modsqr(t1, t3);
593    {
594        modsqr(&mut t3, &t1);
595    }
596    // modsqr(t3, t2);
597    {
598        modsqr(&mut t2, &t3);
599    }
600    // modmul(t3, t2, t4);
601    {
602        let mut tmp = [0u64; NLIMBS];
603        modmul(&mut tmp, &t3, &t2);
604        t4 = tmp;
605    }
606    // modsqr(t4, t5);
607    {
608        modsqr(&mut t5, &t4);
609    }
610    // modcpy(t5, t2);
611    modcpy(&mut t2, &t5);
612    // modnsqr(t2, 2);
613    modnsqr(&mut t2, 2);
614    // modsqr(t2, t6);
615    {
616        modsqr(&mut t6, &t2);
617    }
618    // modmul(t2, t6, t6);
619    {
620        let mut tmp = [0u64; NLIMBS];
621        modmul(&mut tmp, &t2, &t6);
622        t6 = tmp;
623    }
624    // modmul(t5, t6, t5);
625    {
626        let mut tmp = [0u64; NLIMBS];
627        modmul(&mut tmp, &t5, &t6);
628        t5 = tmp;
629    }
630    // modnsqr(t5, 5);
631    modnsqr(&mut t5, 5);
632    // modmul(t2, t5, t2);
633    {
634        let mut tmp = [0u64; NLIMBS];
635        modmul(&mut tmp, &t2, &t5);
636        t2 = tmp;
637    }
638    // modcpy(t2, t5);
639    modcpy(&mut t5, &t2);
640    // modnsqr(t5, 12);
641    modnsqr(&mut t5, 12);
642    // modmul(t2, t5, t2);
643    {
644        let mut tmp = [0u64; NLIMBS];
645        modmul(&mut tmp, &t2, &t5);
646        t2 = tmp;
647    }
648    // modcpy(t2, t5);
649    modcpy(&mut t5, &t2);
650    // modnsqr(t5, 2);
651    modnsqr(&mut t5, 2);
652    // modmul(t2, t5, t5);
653    {
654        let mut tmp = [0u64; NLIMBS];
655        modmul(&mut tmp, &t2, &t5);
656        t5 = tmp;
657    }
658    // modmul(t4, t5, t4);
659    {
660        let mut tmp = [0u64; NLIMBS];
661        modmul(&mut tmp, &t4, &t5);
662        t4 = tmp;
663    }
664    // modsqr(t4, t5);
665    {
666        modsqr(&mut t5, &t4);
667    }
668    // modmul(t2, t5, t2);
669    {
670        let mut tmp = [0u64; NLIMBS];
671        modmul(&mut tmp, &t2, &t5);
672        t2 = tmp;
673    }
674    // modmul(t4, t2, t4);
675    {
676        let mut tmp = [0u64; NLIMBS];
677        modmul(&mut tmp, &t4, &t2);
678        t4 = tmp;
679    }
680    // modnsqr(t4, 27);
681    modnsqr(&mut t4, 27);
682    // modmul(t2, t4, t2);
683    {
684        let mut tmp = [0u64; NLIMBS];
685        modmul(&mut tmp, &t2, &t4);
686        t2 = tmp;
687    }
688    // modmul(t1, t2, t2);
689    {
690        let mut tmp = [0u64; NLIMBS];
691        modmul(&mut tmp, &t1, &t2);
692        t2 = tmp;
693    }
694    // modcpy(t2, t4);
695    modcpy(&mut t4, &t2);
696    // modnsqr(t4, 2);
697    modnsqr(&mut t4, 2);
698    // modmul(t3, t4, t3);
699    {
700        let mut tmp = [0u64; NLIMBS];
701        modmul(&mut tmp, &t3, &t4);
702        t3 = tmp;
703    }
704    // modnsqr(t3, 58);
705    modnsqr(&mut t3, 58);
706    // modmul(t2, t3, t2);
707    {
708        let mut tmp = [0u64; NLIMBS];
709        modmul(&mut tmp, &t2, &t3);
710        t2 = tmp;
711    }
712    // modmul(z, t2, z);
713    {
714        let mut tmp = [0u64; NLIMBS];
715        modmul(&mut tmp, z, &t2);
716        *z = tmp;
717    }
718    // modcpy(z, t2);
719    modcpy(&mut t2, z);
720    // modnsqr(t2, 4);
721    modnsqr(&mut t2, 4);
722    // modmul(t1, t2, t1);
723    {
724        let mut tmp = [0u64; NLIMBS];
725        modmul(&mut tmp, &t1, &t2);
726        t1 = tmp;
727    }
728    // modmul(t0, t1, t0);
729    {
730        let mut tmp = [0u64; NLIMBS];
731        modmul(&mut tmp, &t0, &t1);
732        t0 = tmp;
733    }
734    // modmul(t1, t0, t1);
735    {
736        let mut tmp = [0u64; NLIMBS];
737        modmul(&mut tmp, &t1, &t0);
738        t1 = tmp;
739    }
740    // modsqr(t1, t2);
741    {
742        modsqr(&mut t2, &t1);
743    }
744    // modmul(t0, t2, t0);
745    {
746        let mut tmp = [0u64; NLIMBS];
747        modmul(&mut tmp, &t0, &t2);
748        t0 = tmp;
749    }
750    // modcpy(t0, t2);
751    modcpy(&mut t2, &t0);
752    // modnsqr(t2, 2);
753    modnsqr(&mut t2, 2);
754    // modmul(t0, t2, t2);
755    {
756        let mut tmp = [0u64; NLIMBS];
757        modmul(&mut tmp, &t0, &t2);
758        t2 = tmp;
759    }
760    // modmul(t1, t2, t1);
761    {
762        let mut tmp = [0u64; NLIMBS];
763        modmul(&mut tmp, &t1, &t2);
764        t1 = tmp;
765    }
766    // modmul(t0, t1, t0);
767    {
768        let mut tmp = [0u64; NLIMBS];
769        modmul(&mut tmp, &t0, &t1);
770        t0 = tmp;
771    }
772    // modnsqr(t1, 128);
773    modnsqr(&mut t1, 128);
774    // modmul(t0, t1, t1);
775    {
776        let mut tmp = [0u64; NLIMBS];
777        modmul(&mut tmp, &t0, &t1);
778        t1 = tmp;
779    }
780    // modnsqr(t1, 128);
781    modnsqr(&mut t1, 128);
782    // modmul(t0, t1, t0);
783    {
784        let mut tmp = [0u64; NLIMBS];
785        modmul(&mut tmp, &t0, &t1);
786        t0 = tmp;
787    }
788    // modnsqr(t0, 119);
789    modnsqr(&mut t0, 119);
790    // modmul(z, t0, z);
791    {
792        let mut tmp = [0u64; NLIMBS];
793        modmul(&mut tmp, z, &t0);
794        *z = tmp;
795    }
796}
797
798/// `modinv`: `z <- 1 / x mod p`. If `h` is `Some(h)` the precomputed
799/// progenitor `x^((p-3)/4)` is used; otherwise computed via `modpro`.
800#[inline]
801pub(crate) fn modinv(z: &mut [u64; NLIMBS], x: &[u64; NLIMBS], h: Option<&[u64; NLIMBS]>) {
802    let mut s = [0u64; NLIMBS];
803    let mut t = [0u64; NLIMBS];
804    match h {
805        None => modpro(&mut t, x),
806        Some(h) => modcpy(&mut t, h),
807    }
808    modcpy(&mut s, x);
809    modnsqr(&mut t, 2);
810    modmul(z, &s, &t);
811}
812
813/// Convert from canonical integer form to internal Montgomery form
814/// (multiplies by `R2_NRES` and divides by Montgomery `R`).
815#[inline]
816pub(crate) fn nres(n: &mut [u64; NLIMBS], m: &[u64; NLIMBS]) {
817    modmul(n, m, &R2_NRES);
818}
819
820/// Convert from internal Montgomery form back to canonical integer
821/// form, fully reducing modulo `p`.
822#[inline]
823pub(crate) fn redc(m: &mut [u64; NLIMBS], n: &[u64; NLIMBS]) {
824    let mut c = [0u64; NLIMBS];
825    c[0] = 1;
826    modmul(m, n, &c);
827    let _ = modfsb(m);
828}
829
830/// Returns `1` if `a == 0 mod p`, `0` otherwise.
831#[inline]
832pub(crate) fn modis0(a: &[u64; NLIMBS]) -> u32 {
833    let mut c = [0u64; NLIMBS];
834    redc(&mut c, a);
835    let mut d: u64 = 0;
836    for limb in c.iter() {
837        d |= *limb;
838    }
839    ((1u64) & ((d.wrapping_sub(1)) >> RADIX)) as u32
840}
841
842/// Returns `1` if `a == 1 mod p`, `0` otherwise.
843#[inline]
844pub(crate) fn modis1(a: &[u64; NLIMBS]) -> u32 {
845    let mut c = [0u64; NLIMBS];
846    redc(&mut c, a);
847    let mut d: u64 = 0;
848    for limb in c.iter().skip(1) {
849        d |= *limb;
850    }
851    let c0 = c[0];
852    ((1u64) & ((d.wrapping_sub(1)) >> RADIX) & (((c0 ^ 1).wrapping_sub(1)) >> RADIX)) as u32
853}
854
855/// Returns `1` if `a == b mod p`, `0` otherwise. Constant-time: folds
856/// the per-limb differences into a single bit using masked XOR.
857#[inline]
858pub(crate) fn modcmp(a: &[u64; NLIMBS], b: &[u64; NLIMBS]) -> u32 {
859    let mut c = [0u64; NLIMBS];
860    let mut d = [0u64; NLIMBS];
861    redc(&mut c, a);
862    redc(&mut d, b);
863    let mut eq: u64 = 1;
864    for i in 0..NLIMBS {
865        eq &= ((c[i] ^ d[i]).wrapping_sub(1)) >> RADIX;
866    }
867    eq &= 1;
868    eq as u32
869}
870
871/// Returns `1` if `x` is a quadratic residue (or zero), `0`
872/// otherwise. If `h` is `Some`, it is interpreted as the precomputed
873/// progenitor `x^((p-3)/4)`; otherwise the progenitor is computed.
874#[inline]
875pub(crate) fn modqr(x: &[u64; NLIMBS], h: Option<&[u64; NLIMBS]>) -> u32 {
876    let mut r = [0u64; NLIMBS];
877    match h {
878        None => {
879            modpro(&mut r, x);
880            let mut r2 = [0u64; NLIMBS];
881            modsqr(&mut r2, &r);
882            r = r2;
883        }
884        Some(h) => {
885            modsqr(&mut r, h);
886        }
887    }
888    let mut r2 = [0u64; NLIMBS];
889    modmul(&mut r2, &r, x);
890    let r = r2;
891    modis1(&r) | modis0(x)
892}
893
894/// `r <- sqrt(x) mod p`. Since `p = 3 mod 4`, the square root is
895/// `x^((p+1)/4) = x * x^((p-3)/4)`, so the progenitor (cached as `h`
896/// when available) is multiplied by `x` to get the answer. Output is
897/// well-defined modulo a sign.
898#[inline]
899pub(crate) fn modsqrt(r: &mut [u64; NLIMBS], x: &[u64; NLIMBS], h: Option<&[u64; NLIMBS]>) {
900    let mut y = [0u64; NLIMBS];
901    let mut s = [0u64; NLIMBS];
902    match h {
903        None => modpro(&mut y, x),
904        Some(h) => modcpy(&mut y, h),
905    }
906    modmul(&mut s, &y, x);
907    modcpy(r, &s);
908}
909
910/// Set `a` to the small integer `x` in internal Montgomery form.
911#[inline]
912pub(crate) fn modint(a: &mut [u64; NLIMBS], x: u64) {
913    a[0] = x;
914    a[1] = 0;
915    a[2] = 0;
916    a[3] = 0;
917    a[4] = 0;
918    a[5] = 0;
919    a[6] = 0;
920    a[7] = 0;
921    a[8] = 0;
922    let a_copy = *a;
923    nres(a, &a_copy);
924}
925
926/// `c <- a * x mod 2p` for a small integer `x`.
927#[inline]
928pub(crate) fn modmli(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS], x: u64) {
929    let mut t = [0u64; NLIMBS];
930    modint(&mut t, x);
931    modmul(c, a, &t);
932}
933
934/// Shift `a` left by `n < 57` bits (per-limb, with carry into the
935/// next limb).
936#[inline]
937pub(crate) fn modshl(a: &mut [u64; NLIMBS], n: u32) {
938    a[8] = (a[8] << n) | (a[7] >> (RADIX - n));
939    for i in (1..=7).rev() {
940        a[i] = ((a[i] << n) & MASK) | (a[i - 1] >> (RADIX - n));
941    }
942    a[0] = (a[0] << n) & MASK;
943}
944
945/// Shift `a` right by `n < 57` bits, returning the low `n`
946/// shifted-out bits.
947#[inline]
948pub(crate) fn modshr(a: &mut [u64; NLIMBS], n: u32) -> u64 {
949    let r = a[0] & ((1u64 << n) - 1);
950    for i in 0..8 {
951        a[i] = (a[i] >> n) | ((a[i + 1] << (RADIX - n)) & MASK);
952    }
953    a[8] >>= n;
954    r
955}
956
957/// Constant-time conditional swap of `f` and `g` if `b == 1`, no-op
958/// if `b == 0`.
959#[inline]
960pub(crate) fn modcsw(b: u64, g: &mut [u64; NLIMBS], f: &mut [u64; NLIMBS]) {
961    let r: u64 = 0x3cc3_c33c_5aa5_a55a;
962    let c0 = (1u64.wrapping_sub(b)).wrapping_add(r);
963    let c1 = b.wrapping_add(r);
964    for i in 0..NLIMBS {
965        let s = g[i];
966        let t = f[i];
967        let w = r.wrapping_mul(t.wrapping_add(s));
968        let new_f = c0
969            .wrapping_mul(t)
970            .wrapping_add(c1.wrapping_mul(s))
971            .wrapping_sub(w);
972        let new_g = c0
973            .wrapping_mul(s)
974            .wrapping_add(c1.wrapping_mul(t))
975            .wrapping_sub(w);
976        f[i] = new_f;
977        g[i] = new_g;
978    }
979}
980
981/// Bytes per encoded Fp element for Level 5.
982const ENCODED_BYTES: usize = 64;
983
984/// Serialize `a` (in internal Montgomery form) to its canonical
985/// 64-byte little-endian representation: convert back to canonical
986/// integer form via [`redc`], then peel off one byte at a time.
987#[inline]
988pub(crate) fn fp_encode(out: &mut [u8], a: &[u64; NLIMBS]) {
989    debug_assert!(out.len() >= ENCODED_BYTES);
990    let mut c = [0u64; NLIMBS];
991    redc(&mut c, a);
992    for byte in out.iter_mut().take(ENCODED_BYTES) {
993        *byte = (c[0] & 0xff) as u8;
994        let _ = modshr(&mut c, 8);
995    }
996}
997
998/// Parse 64 canonical little-endian bytes into an Fp element.
999/// Returns `0xFFFF_FFFF` on in-range input, `0` otherwise; on failure
1000/// `out` is zeroed.
1001#[inline]
1002pub(crate) fn fp_decode(out: &mut [u64; NLIMBS], bytes: &[u8]) -> u32 {
1003    if bytes.len() < ENCODED_BYTES {
1004        *out = [0; NLIMBS];
1005        return 0;
1006    }
1007    *out = [0; NLIMBS];
1008    // Build the integer by shifting in one byte at a time, MSB first.
1009    for i in (0..ENCODED_BYTES).rev() {
1010        modshl(out, 8);
1011        out[0] = out[0].wrapping_add(bytes[i] as u64);
1012    }
1013    // res is all-ones if the value was in `[0, p)`, all-zeros otherwise.
1014    let res_u64 = 0u64.wrapping_sub(modfsb(out) as u64);
1015    let res_u32 = res_u64 as u32;
1016    let out_copy = *out;
1017    nres(out, &out_copy);
1018    for limb in out.iter_mut() {
1019        *limb &= res_u64;
1020    }
1021    res_u32
1022}
1023
1024/// Partial reduction of an 8-limb 64-bit-saturated accumulator: split
1025/// off the top 12 bits of limb 7 and fold them back into the low limbs
1026/// using the identity `27 * 2^500 = 1 (mod p)`. Used by
1027/// [`fp_decode_reduce`] to absorb each 64-byte chunk of a long input.
1028fn partial_reduce_8(out: &mut [u64; 8], src: &[u64; 8]) {
1029    // Split value in high (12 bits) and low (500 bits) parts.
1030    // 500 bits = 7 full 64-bit words + 52 bits in word 7.
1031    let h = src[7] >> 52;
1032    let l = src[7] & 0x000F_FFFF_FFFF_FFFF;
1033    // 27 * 2^500 = 1 mod p; hence, we add floor(h/27) + (h mod 27)*2^500
1034    // to the low part.
1035    let quo = (h.wrapping_mul(0x12F7)) >> 17;
1036    let rem = h.wrapping_sub(27u64.wrapping_mul(quo));
1037    let (r0, c0) = src[0].overflowing_add(quo);
1038    let (r1, c1) = src[1].overflowing_add(c0 as u64);
1039    let (r2, c2) = src[2].overflowing_add(c1 as u64);
1040    let (r3, c3) = src[3].overflowing_add(c2 as u64);
1041    let (r4, c4) = src[4].overflowing_add(c3 as u64);
1042    let (r5, c5) = src[5].overflowing_add(c4 as u64);
1043    let (r6, c6) = src[6].overflowing_add(c5 as u64);
1044    let r7 = l.wrapping_add(rem << 52).wrapping_add(c6 as u64);
1045    out[0] = r0;
1046    out[1] = r1;
1047    out[2] = r2;
1048    out[3] = r3;
1049    out[4] = r4;
1050    out[5] = r5;
1051    out[6] = r6;
1052    out[7] = r7;
1053}
1054
1055/// Parse a little-endian byte string of arbitrary length, reducing it
1056/// modulo `p`. Always succeeds. Used to map hash output uniformly into
1057/// Fp.
1058#[inline]
1059pub(crate) fn fp_decode_reduce(out: &mut [u64; NLIMBS], bytes: &[u8]) {
1060    *out = [0; NLIMBS];
1061    if bytes.is_empty() {
1062        return;
1063    }
1064
1065    let mut len = bytes.len();
1066    let rem = len % ENCODED_BYTES;
1067    if rem != 0 {
1068        // Decode a partial trailing block (zero-padded to 64 bytes;
1069        // the value is already < p so no reduction is needed).
1070        let k = len - rem;
1071        let mut tmp = [0u8; ENCODED_BYTES];
1072        tmp[..(len - k)].copy_from_slice(&bytes[k..]);
1073        let _ = fp_decode(out, &tmp);
1074        len = k;
1075    }
1076
1077    while len > 0 {
1078        // Shift the accumulator left by 2^512 in the Montgomery sense:
1079        // multiplying by `R2` (which represents 2^512) makes room for
1080        // the next 64-byte chunk in the high bits.
1081        let out_copy = *out;
1082        modmul(out, &out_copy, &R2);
1083        len -= ENCODED_BYTES;
1084        let mut t = [0u64; 8];
1085        for (j, t_j) in t.iter_mut().enumerate() {
1086            let off = len + j * 8;
1087            *t_j = u64::from_le_bytes(
1088                bytes[off..off + 8]
1089                    .try_into()
1090                    .expect("invariant: slice length is exactly 8"),
1091            );
1092        }
1093        let mut reduced = [0u64; 8];
1094        partial_reduce_8(&mut reduced, &t);
1095        let mut tmp_bytes = [0u8; ENCODED_BYTES];
1096        for (j, r) in reduced.iter().enumerate() {
1097            tmp_bytes[j * 8..(j + 1) * 8].copy_from_slice(&r.to_le_bytes());
1098        }
1099        let mut a = [0u64; NLIMBS];
1100        let _ = fp_decode(&mut a, &tmp_bytes);
1101        let out_copy = *out;
1102        modadd(out, &out_copy, &a);
1103    }
1104}
1105
1106/// Borrow a `&Array<u64, U9>` as `&[u64; 9]` for the limb-level helpers.
1107#[inline]
1108fn as_arr(a: &Array<u64, <Level5 as crate::params::SecurityLevel>::FpLimbs>) -> &[u64; NLIMBS] {
1109    // SAFETY: Array<u64, U9> has exactly NLIMBS=9 elements, matching [u64; 9]
1110    <&[u64; NLIMBS]>::try_from(&a[..]).expect("invariant: Level5 FpLimbs == U9")
1111}
1112
1113/// Mutable borrow of `&mut Array<u64, U9>` as `&mut [u64; 9]`.
1114#[inline]
1115fn as_arr_mut(
1116    a: &mut Array<u64, <Level5 as crate::params::SecurityLevel>::FpLimbs>,
1117) -> &mut [u64; NLIMBS] {
1118    // SAFETY: Array<u64, U9> has exactly NLIMBS=9 elements, matching [u64; 9]
1119    <&mut [u64; NLIMBS]>::try_from(&mut a[..]).expect("invariant: Level5 FpLimbs == U9")
1120}
1121
1122impl FpBackend for Level5 {
1123    #[inline]
1124    fn set_zero(out: &mut Array<u64, Self::FpLimbs>) {
1125        *as_arr_mut(out) = ZERO;
1126    }
1127
1128    #[inline]
1129    fn set_one(out: &mut Array<u64, Self::FpLimbs>) {
1130        *as_arr_mut(out) = ONE;
1131    }
1132
1133    #[inline]
1134    fn set_small(out: &mut Array<u64, Self::FpLimbs>, val: u64) {
1135        modint(as_arr_mut(out), val);
1136    }
1137
1138    #[inline]
1139    fn is_equal(a: &Array<u64, Self::FpLimbs>, b: &Array<u64, Self::FpLimbs>) -> Choice {
1140        Choice::from(modcmp(as_arr(a), as_arr(b)) as u8)
1141    }
1142
1143    #[inline]
1144    fn is_zero(a: &Array<u64, Self::FpLimbs>) -> Choice {
1145        Choice::from(modis0(as_arr(a)) as u8)
1146    }
1147
1148    #[inline]
1149    fn copy(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1150        modcpy(as_arr_mut(out), as_arr(a));
1151    }
1152
1153    #[inline]
1154    fn add(
1155        out: &mut Array<u64, Self::FpLimbs>,
1156        a: &Array<u64, Self::FpLimbs>,
1157        b: &Array<u64, Self::FpLimbs>,
1158    ) {
1159        modadd(as_arr_mut(out), as_arr(a), as_arr(b));
1160    }
1161
1162    #[inline]
1163    fn sub(
1164        out: &mut Array<u64, Self::FpLimbs>,
1165        a: &Array<u64, Self::FpLimbs>,
1166        b: &Array<u64, Self::FpLimbs>,
1167    ) {
1168        modsub(as_arr_mut(out), as_arr(a), as_arr(b));
1169    }
1170
1171    #[inline]
1172    fn neg(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1173        modneg(as_arr_mut(out), as_arr(a));
1174    }
1175
1176    #[inline]
1177    fn mul(
1178        out: &mut Array<u64, Self::FpLimbs>,
1179        a: &Array<u64, Self::FpLimbs>,
1180        b: &Array<u64, Self::FpLimbs>,
1181    ) {
1182        modmul(as_arr_mut(out), as_arr(a), as_arr(b));
1183    }
1184
1185    #[inline]
1186    fn sqr(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1187        modsqr(as_arr_mut(out), as_arr(a));
1188    }
1189
1190    #[inline]
1191    fn inv(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1192        let a_copy = *as_arr(a);
1193        modinv(as_arr_mut(out), &a_copy, None);
1194    }
1195
1196    #[inline]
1197    fn sqrt(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1198        let a_copy = *as_arr(a);
1199        modsqrt(as_arr_mut(out), &a_copy, None);
1200    }
1201
1202    #[inline]
1203    fn is_square(a: &Array<u64, Self::FpLimbs>) -> Choice {
1204        Choice::from(modqr(as_arr(a), None) as u8)
1205    }
1206
1207    #[inline]
1208    fn half(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1209        modmul(as_arr_mut(out), &TWO_INV, as_arr(a));
1210    }
1211
1212    #[inline]
1213    fn div3(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1214        modmul(as_arr_mut(out), &THREE_INV, as_arr(a));
1215    }
1216
1217    #[inline]
1218    fn exp3div4(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1219        modpro(as_arr_mut(out), as_arr(a));
1220    }
1221
1222    #[inline]
1223    fn mul_small(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>, val: u32) {
1224        modmli(as_arr_mut(out), as_arr(a), val as u64);
1225    }
1226
1227    #[inline]
1228    fn encode(out: &mut [u8], a: &Array<u64, Self::FpLimbs>) {
1229        fp_encode(out, as_arr(a));
1230    }
1231
1232    #[inline]
1233    fn decode(out: &mut Array<u64, Self::FpLimbs>, bytes: &[u8]) -> Choice {
1234        Choice::from((fp_decode(as_arr_mut(out), bytes) & 1) as u8)
1235    }
1236
1237    #[inline]
1238    fn decode_reduce(out: &mut Array<u64, Self::FpLimbs>, bytes: &[u8]) {
1239        fp_decode_reduce(as_arr_mut(out), bytes);
1240    }
1241
1242    #[inline]
1243    fn cswap(a: &mut Array<u64, Self::FpLimbs>, b: &mut Array<u64, Self::FpLimbs>, ctl: Choice) {
1244        modcsw(ctl.unwrap_u8() as u64, as_arr_mut(a), as_arr_mut(b));
1245    }
1246
1247    #[inline]
1248    fn select(
1249        out: &mut Array<u64, Self::FpLimbs>,
1250        a0: &Array<u64, Self::FpLimbs>,
1251        a1: &Array<u64, Self::FpLimbs>,
1252        ctl: Choice,
1253    ) {
1254        let cw = 0u64.wrapping_sub(ctl.unwrap_u8() as u64);
1255        let a0r = as_arr(a0);
1256        let a1r = as_arr(a1);
1257        let out_r = as_arr_mut(out);
1258        for i in 0..NLIMBS {
1259            out_r[i] = a0r[i] ^ (cw & (a0r[i] ^ a1r[i]));
1260        }
1261    }
1262}