Skip to main content

sqisign_verify/fp/
level3.rs

1//!
2//! The prime is `p = 65 * 2^376 - 1`. Field elements are 7-limb arrays
3//! `[u64; 7]` storing values in unsaturated radix `2^55`: each limb
4//! holds a roughly-55-bit quantity, leaving 9 carry bits at the top.
5//! Arithmetic operates in an internal Montgomery form; canonical
6//! little-endian byte encoding (via `fp_encode` / `fp_decode`)
7//! returns the standard integer representation in `[0, p)`.
8//!
9//! The free functions below implement the limb-level primitives;
10//! the [`FpBackend`] trait impl for [`Level3`] at the bottom of the
11//! file wires them into the generic [`super::Fp`] API.
12
13use super::FpBackend;
14use crate::params::Level3;
15use hybrid_array::Array;
16use subtle::Choice;
17
18/// Number of 64-bit limbs in an `Fp` element.
19pub const NLIMBS: usize = 7;
20
21/// Bit width of each unsaturated limb.
22pub const RADIX: u32 = 55;
23
24/// `2^RADIX - 1`: keeps the low 55 bits of a limb.
25pub const MASK: u64 = (1u64 << RADIX) - 1;
26
27/// The contribution of `p + 1 = 65 * 2^376` to limb 6 in radix-`2^55`:
28/// `65 * 2^376 / 2^(55*6) = 65 * 2^46`. Used as the Montgomery folding
29/// constant inside `modmul` and `modsqr`.
30pub const P6: u64 = 0x10400000000000u64;
31
32/// `2 * P6`, used inside `modadd`, `modsub`, and `modneg` to
33/// keep partial sums in `[-p, p)` before the final reduction step.
34pub const TWO_P6: u64 = 0x20800000000000u64;
35
36/// `0` as 7 radix-`2^55` limbs.
37pub const ZERO: [u64; NLIMBS] = [0; NLIMBS];
38
39/// Internal Montgomery form of `1 mod p`.
40pub const ONE: [u64; NLIMBS] = [
41    0x0000000000000007,
42    0x0000000000000000,
43    0x0000000000000000,
44    0x0000000000000000,
45    0x0000000000000000,
46    0x0000000000000000,
47    0x000e400000000000,
48];
49
50/// Internal Montgomery form of 2⁻¹ mod p.
51pub const TWO_INV: [u64; NLIMBS] = [
52    0x0000000000000003,
53    0x0000000000000000,
54    0x0000000000000000,
55    0x0000000000000000,
56    0x0000000000000000,
57    0x0000000000000000,
58    0x000f400000000000,
59];
60
61/// Internal Montgomery form of 3⁻¹ mod p.
62pub const THREE_INV: [u64; NLIMBS] = [
63    0x0055555555555557,
64    0x002aaaaaaaaaaaaa,
65    0x0055555555555555,
66    0x002aaaaaaaaaaaaa,
67    0x0055555555555555,
68    0x002aaaaaaaaaaaaa,
69    0x000f955555555555,
70];
71
72/// Internal Montgomery form of `2^384 mod p`. Used by
73/// `fp_decode_reduce` when folding 48-byte input blocks.
74pub const R2: [u64; NLIMBS] = [
75    0x0007e07e07e07e26,
76    0x007c0fc0fc0fc0fc,
77    0x0001f81f81f81f81,
78    0x003f03f03f03f03f,
79    0x00607e07e07e07e0,
80    0x000fc0fc0fc0fc0f,
81    0x000e9f81f81f81f8,
82];
83
84/// Conversion constant for `nres`: multiplying by `R2_NRES` and
85/// dividing by Montgomery `R` takes a canonical integer into internal
86/// Montgomery form (`R2_NRES = R^2 mod p` in the internal Montgomery
87/// scheme).
88pub const R2_NRES: [u64; NLIMBS] = [
89    0xfc0fc0fc0fc4d,
90    0x781f81f81f81f8,
91    0x3f03f03f03f03,
92    0x7e07e07e07e07e,
93    0x40fc0fc0fc0fc0,
94    0x1f81f81f81f81f,
95    0xcff03f03f03f0,
96];
97
98/// Propagate carries between limbs, leaving each of limbs 0..5 in
99/// `[0, 2^55)` and limb 6 holding the accumulated high bits. Returns
100/// `0xFFFF_FFFF_FFFF_FFFF` if the value (interpreted as two's-complement
101/// via the top bits of limb 6) is negative, else `0`.
102#[inline]
103pub(crate) fn prop(n: &mut [u64; NLIMBS]) -> u64 {
104    let mask = MASK;
105    let mut carry: i64 = n[0] as i64;
106    carry >>= RADIX;
107    n[0] &= mask;
108    for limb in n.iter_mut().take(6).skip(1) {
109        carry = carry.wrapping_add(*limb as i64);
110        *limb = (carry as u64) & mask;
111        carry >>= RADIX;
112    }
113    n[6] = n[6].wrapping_add(carry as u64);
114    // Sign mask: 0 if non-negative, all-ones if negative.
115    // Limb 6 is not masked to radix width; the sign occupies the high bits.
116    let sign = (n[6] >> 1) >> 62;
117    0u64.wrapping_sub(sign)
118}
119
120/// [`prop`] followed by a conditional add of `p` if the propagation
121/// detected a negative intermediate, then a second [`prop`] pass.
122/// Returns `1` if the value was originally negative (and was just
123/// fixed up), `0` otherwise.
124#[inline]
125pub(crate) fn flatten(n: &mut [u64; NLIMBS]) -> u32 {
126    let carry = prop(n);
127    n[0] = n[0].wrapping_sub(1 & carry);
128    n[6] = n[6].wrapping_add(0x10400000000000u64 & carry);
129    let _ = prop(n);
130    (carry & 1) as u32
131}
132
133/// Final subtract of `p`: tries `n -= p` and adds `p` back if the
134/// result went negative. Returns `1` if the original value was `< p`
135/// (and was preserved), `0` if it was `>= p` (and was reduced).
136#[inline]
137pub(crate) fn modfsb(n: &mut [u64; NLIMBS]) -> u32 {
138    n[0] = n[0].wrapping_add(1);
139    n[6] = n[6].wrapping_sub(0x10400000000000u64);
140    flatten(n)
141}
142
143/// `n <- a + b mod 2p` (lazily reduced).
144#[inline]
145pub(crate) fn modadd(n: &mut [u64; NLIMBS], a: &[u64; NLIMBS], b: &[u64; NLIMBS]) {
146    n[0] = a[0].wrapping_add(b[0]);
147    n[1] = a[1].wrapping_add(b[1]);
148    n[2] = a[2].wrapping_add(b[2]);
149    n[3] = a[3].wrapping_add(b[3]);
150    n[4] = a[4].wrapping_add(b[4]);
151    n[5] = a[5].wrapping_add(b[5]);
152    n[6] = a[6].wrapping_add(b[6]);
153    n[0] = n[0].wrapping_add(2);
154    n[6] = n[6].wrapping_sub(TWO_P6);
155    let carry = prop(n);
156    n[0] = n[0].wrapping_sub(2 & carry);
157    n[6] = n[6].wrapping_add(TWO_P6 & carry);
158    let _ = prop(n);
159}
160
161/// `n <- a - b mod 2p` (lazily reduced).
162#[inline]
163pub(crate) fn modsub(n: &mut [u64; NLIMBS], a: &[u64; NLIMBS], b: &[u64; NLIMBS]) {
164    n[0] = a[0].wrapping_sub(b[0]);
165    n[1] = a[1].wrapping_sub(b[1]);
166    n[2] = a[2].wrapping_sub(b[2]);
167    n[3] = a[3].wrapping_sub(b[3]);
168    n[4] = a[4].wrapping_sub(b[4]);
169    n[5] = a[5].wrapping_sub(b[5]);
170    n[6] = a[6].wrapping_sub(b[6]);
171    let carry = prop(n);
172    n[0] = n[0].wrapping_sub(2 & carry);
173    n[6] = n[6].wrapping_add(TWO_P6 & carry);
174    let _ = prop(n);
175}
176
177/// `n <- -b mod 2p` (lazily reduced).
178#[inline]
179pub(crate) fn modneg(n: &mut [u64; NLIMBS], b: &[u64; NLIMBS]) {
180    n[0] = 0u64.wrapping_sub(b[0]);
181    n[1] = 0u64.wrapping_sub(b[1]);
182    n[2] = 0u64.wrapping_sub(b[2]);
183    n[3] = 0u64.wrapping_sub(b[3]);
184    n[4] = 0u64.wrapping_sub(b[4]);
185    n[5] = 0u64.wrapping_sub(b[5]);
186    n[6] = 0u64.wrapping_sub(b[6]);
187    let carry = prop(n);
188    n[0] = n[0].wrapping_sub(2 & carry);
189    n[6] = n[6].wrapping_add(TWO_P6 & carry);
190    let _ = prop(n);
191}
192
193/// `c <- a * b mod 2p`. Schoolbook 7x7 multiplication in radix `2^55`
194/// with interleaved Montgomery folding: each newly-emitted low limb
195/// `v_i` is added back at limb position `i + 6` as `v_i * P6`, which
196/// represents `v_i * 65 * 2^376 = v_i * (p + 1) = v_i (mod p)` and
197/// effectively divides the final result by Montgomery `R = 2^385`.
198#[inline]
199pub(crate) fn modmul(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS], b: &[u64; NLIMBS]) {
200    let mask = MASK;
201    let p6: u128 = P6 as u128;
202
203    // t accumulates the partial sum at the current limb position. After
204    // emitting each limb (`t & mask`) we shift right by RADIX.
205    let mut t: u128;
206
207    t = (a[0] as u128) * (b[0] as u128);
208    let v0 = (t as u64) & mask;
209    t >>= RADIX;
210
211    t = t
212        .wrapping_add((a[0] as u128) * (b[1] as u128))
213        .wrapping_add((a[1] as u128) * (b[0] as u128));
214    let v1 = (t as u64) & mask;
215    t >>= RADIX;
216
217    t = t
218        .wrapping_add((a[0] as u128) * (b[2] as u128))
219        .wrapping_add((a[1] as u128) * (b[1] as u128))
220        .wrapping_add((a[2] as u128) * (b[0] as u128));
221    let v2 = (t as u64) & mask;
222    t >>= RADIX;
223
224    t = t
225        .wrapping_add((a[0] as u128) * (b[3] as u128))
226        .wrapping_add((a[1] as u128) * (b[2] as u128))
227        .wrapping_add((a[2] as u128) * (b[1] as u128))
228        .wrapping_add((a[3] as u128) * (b[0] as u128));
229    let v3 = (t as u64) & mask;
230    t >>= RADIX;
231
232    t = t
233        .wrapping_add((a[0] as u128) * (b[4] as u128))
234        .wrapping_add((a[1] as u128) * (b[3] as u128))
235        .wrapping_add((a[2] as u128) * (b[2] as u128))
236        .wrapping_add((a[3] as u128) * (b[1] as u128))
237        .wrapping_add((a[4] as u128) * (b[0] as u128));
238    let v4 = (t as u64) & mask;
239    t >>= RADIX;
240
241    t = t
242        .wrapping_add((a[0] as u128) * (b[5] as u128))
243        .wrapping_add((a[1] as u128) * (b[4] as u128))
244        .wrapping_add((a[2] as u128) * (b[3] as u128))
245        .wrapping_add((a[3] as u128) * (b[2] as u128))
246        .wrapping_add((a[4] as u128) * (b[1] as u128))
247        .wrapping_add((a[5] as u128) * (b[0] as u128));
248    let v5 = (t as u64) & mask;
249    t >>= RADIX;
250
251    t = t
252        .wrapping_add((a[0] as u128) * (b[6] as u128))
253        .wrapping_add((a[1] as u128) * (b[5] as u128))
254        .wrapping_add((a[2] as u128) * (b[4] as u128))
255        .wrapping_add((a[3] as u128) * (b[3] as u128))
256        .wrapping_add((a[4] as u128) * (b[2] as u128))
257        .wrapping_add((a[5] as u128) * (b[1] as u128))
258        .wrapping_add((a[6] as u128) * (b[0] as u128))
259        .wrapping_add((v0 as u128) * p6);
260    let v6 = (t as u64) & mask;
261    t >>= RADIX;
262
263    t = t
264        .wrapping_add((a[1] as u128) * (b[6] as u128))
265        .wrapping_add((a[2] as u128) * (b[5] as u128))
266        .wrapping_add((a[3] as u128) * (b[4] as u128))
267        .wrapping_add((a[4] as u128) * (b[3] as u128))
268        .wrapping_add((a[5] as u128) * (b[2] as u128))
269        .wrapping_add((a[6] as u128) * (b[1] as u128))
270        .wrapping_add((v1 as u128) * p6);
271    c[0] = (t as u64) & mask;
272    t >>= RADIX;
273
274    t = t
275        .wrapping_add((a[2] as u128) * (b[6] as u128))
276        .wrapping_add((a[3] as u128) * (b[5] as u128))
277        .wrapping_add((a[4] as u128) * (b[4] as u128))
278        .wrapping_add((a[5] as u128) * (b[3] as u128))
279        .wrapping_add((a[6] as u128) * (b[2] as u128))
280        .wrapping_add((v2 as u128) * p6);
281    c[1] = (t as u64) & mask;
282    t >>= RADIX;
283
284    t = t
285        .wrapping_add((a[3] as u128) * (b[6] as u128))
286        .wrapping_add((a[4] as u128) * (b[5] as u128))
287        .wrapping_add((a[5] as u128) * (b[4] as u128))
288        .wrapping_add((a[6] as u128) * (b[3] as u128))
289        .wrapping_add((v3 as u128) * p6);
290    c[2] = (t as u64) & mask;
291    t >>= RADIX;
292
293    t = t
294        .wrapping_add((a[4] as u128) * (b[6] as u128))
295        .wrapping_add((a[5] as u128) * (b[5] as u128))
296        .wrapping_add((a[6] as u128) * (b[4] as u128))
297        .wrapping_add((v4 as u128) * p6);
298    c[3] = (t as u64) & mask;
299    t >>= RADIX;
300
301    t = t
302        .wrapping_add((a[5] as u128) * (b[6] as u128))
303        .wrapping_add((a[6] as u128) * (b[5] as u128))
304        .wrapping_add((v5 as u128) * p6);
305    c[4] = (t as u64) & mask;
306    t >>= RADIX;
307
308    t = t
309        .wrapping_add((a[6] as u128) * (b[6] as u128))
310        .wrapping_add((v6 as u128) * p6);
311    c[5] = (t as u64) & mask;
312    t >>= RADIX;
313
314    c[6] = t as u64;
315}
316
317/// `c <- a * a mod 2p` (specialized squaring). Each cross-term
318/// `a_i * a_j` (`i != j`) is computed once and doubled, saving roughly
319/// half the partial-product work compared to a general multiply.
320#[inline]
321pub(crate) fn modsqr(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS]) {
322    let mask = MASK;
323    let p6: u128 = P6 as u128;
324
325    let mut t: u128;
326    let mut tot: u128;
327
328    tot = (a[0] as u128) * (a[0] as u128);
329    t = tot;
330    let v0 = (t as u64) & mask;
331    t >>= RADIX;
332
333    tot = (a[0] as u128) * (a[1] as u128);
334    tot = tot.wrapping_mul(2);
335    t = t.wrapping_add(tot);
336    let v1 = (t as u64) & mask;
337    t >>= RADIX;
338
339    tot = (a[0] as u128) * (a[2] as u128);
340    tot = tot.wrapping_mul(2);
341    tot = tot.wrapping_add((a[1] as u128) * (a[1] as u128));
342    t = t.wrapping_add(tot);
343    let v2 = (t as u64) & mask;
344    t >>= RADIX;
345
346    tot = (a[0] as u128) * (a[3] as u128);
347    tot = tot.wrapping_add((a[1] as u128) * (a[2] as u128));
348    tot = tot.wrapping_mul(2);
349    t = t.wrapping_add(tot);
350    let v3 = (t as u64) & mask;
351    t >>= RADIX;
352
353    tot = (a[0] as u128) * (a[4] as u128);
354    tot = tot.wrapping_add((a[1] as u128) * (a[3] as u128));
355    tot = tot.wrapping_mul(2);
356    tot = tot.wrapping_add((a[2] as u128) * (a[2] as u128));
357    t = t.wrapping_add(tot);
358    let v4 = (t as u64) & mask;
359    t >>= RADIX;
360
361    tot = (a[0] as u128) * (a[5] as u128);
362    tot = tot.wrapping_add((a[1] as u128) * (a[4] as u128));
363    tot = tot.wrapping_add((a[2] as u128) * (a[3] as u128));
364    tot = tot.wrapping_mul(2);
365    t = t.wrapping_add(tot);
366    let v5 = (t as u64) & mask;
367    t >>= RADIX;
368
369    tot = (a[0] as u128) * (a[6] as u128);
370    tot = tot.wrapping_add((a[1] as u128) * (a[5] as u128));
371    tot = tot.wrapping_add((a[2] as u128) * (a[4] as u128));
372    tot = tot.wrapping_mul(2);
373    tot = tot.wrapping_add((a[3] as u128) * (a[3] as u128));
374    t = t.wrapping_add(tot);
375    t = t.wrapping_add((v0 as u128) * p6);
376    let v6 = (t as u64) & mask;
377    t >>= RADIX;
378
379    tot = (a[1] as u128) * (a[6] as u128);
380    tot = tot.wrapping_add((a[2] as u128) * (a[5] as u128));
381    tot = tot.wrapping_add((a[3] as u128) * (a[4] as u128));
382    tot = tot.wrapping_mul(2);
383    t = t.wrapping_add(tot);
384    t = t.wrapping_add((v1 as u128) * p6);
385    c[0] = (t as u64) & mask;
386    t >>= RADIX;
387
388    tot = (a[2] as u128) * (a[6] as u128);
389    tot = tot.wrapping_add((a[3] as u128) * (a[5] as u128));
390    tot = tot.wrapping_mul(2);
391    tot = tot.wrapping_add((a[4] as u128) * (a[4] as u128));
392    t = t.wrapping_add(tot);
393    t = t.wrapping_add((v2 as u128) * p6);
394    c[1] = (t as u64) & mask;
395    t >>= RADIX;
396
397    tot = (a[3] as u128) * (a[6] as u128);
398    tot = tot.wrapping_add((a[4] as u128) * (a[5] as u128));
399    tot = tot.wrapping_mul(2);
400    t = t.wrapping_add(tot);
401    t = t.wrapping_add((v3 as u128) * p6);
402    c[2] = (t as u64) & mask;
403    t >>= RADIX;
404
405    tot = (a[4] as u128) * (a[6] as u128);
406    tot = tot.wrapping_mul(2);
407    tot = tot.wrapping_add((a[5] as u128) * (a[5] as u128));
408    t = t.wrapping_add(tot);
409    t = t.wrapping_add((v4 as u128) * p6);
410    c[3] = (t as u64) & mask;
411    t >>= RADIX;
412
413    tot = (a[5] as u128) * (a[6] as u128);
414    tot = tot.wrapping_mul(2);
415    t = t.wrapping_add(tot);
416    t = t.wrapping_add((v5 as u128) * p6);
417    c[4] = (t as u64) & mask;
418    t >>= RADIX;
419
420    tot = (a[6] as u128) * (a[6] as u128);
421    t = t.wrapping_add(tot);
422    t = t.wrapping_add((v6 as u128) * p6);
423    c[5] = (t as u64) & mask;
424    t >>= RADIX;
425
426    c[6] = t as u64;
427}
428
429/// `c <- a`.
430#[inline]
431pub(crate) fn modcpy(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS]) {
432    *c = *a;
433}
434
435/// Square `a` in-place `n` times.
436#[inline]
437pub(crate) fn modnsqr(a: &mut [u64; NLIMBS], n: u32) {
438    for _ in 0..n {
439        let mut tmp = [0u64; NLIMBS];
440        modsqr(&mut tmp, a);
441        *a = tmp;
442    }
443}
444
445/// Square root progenitor: `z <- w^((p-3)/4) mod p` via a fixed
446/// addition chain whose structure encodes the binary representation
447/// of `(p-3)/4` for this prime.
448#[inline]
449pub(crate) fn modpro(z: &mut [u64; NLIMBS], w: &[u64; NLIMBS]) {
450    let mut x = [0u64; NLIMBS];
451    let mut t0 = [0u64; NLIMBS];
452    let mut t1: [u64; NLIMBS];
453    let mut t2 = [0u64; NLIMBS];
454    let mut t3 = [0u64; NLIMBS];
455    let mut t4 = [0u64; NLIMBS];
456    let mut t5 = [0u64; NLIMBS];
457
458    modcpy(&mut x, w);
459    modsqr(z, &x);
460    {
461        let z_copy = *z;
462        modsqr(&mut t0, &z_copy);
463    }
464    {
465        let mut tmp = [0u64; NLIMBS];
466        modmul(&mut tmp, &x, &t0);
467        t1 = tmp;
468    }
469    {
470        let mut tmp = [0u64; NLIMBS];
471        modmul(&mut tmp, z, &t1);
472        *z = tmp;
473    }
474    {
475        let z_copy = *z;
476        modsqr(&mut t0, &z_copy);
477    }
478    {
479        modsqr(&mut t3, &t0);
480    }
481    {
482        modsqr(&mut t4, &t3);
483    }
484    {
485        modsqr(&mut t2, &t4);
486    }
487    modcpy(&mut t5, &t2);
488    modnsqr(&mut t5, 3);
489    {
490        let mut tmp = [0u64; NLIMBS];
491        modmul(&mut tmp, &t2, &t5);
492        t2 = tmp;
493    }
494    modcpy(&mut t5, &t2);
495    modnsqr(&mut t5, 6);
496    {
497        let mut tmp = [0u64; NLIMBS];
498        modmul(&mut tmp, &t2, &t5);
499        t2 = tmp;
500    }
501    modcpy(&mut t5, &t2);
502    modnsqr(&mut t5, 2);
503    {
504        let mut tmp = [0u64; NLIMBS];
505        modmul(&mut tmp, &t4, &t5);
506        t5 = tmp;
507    }
508    modnsqr(&mut t5, 13);
509    {
510        let mut tmp = [0u64; NLIMBS];
511        modmul(&mut tmp, &t2, &t5);
512        t2 = tmp;
513    }
514    modcpy(&mut t5, &t2);
515    modnsqr(&mut t5, 2);
516    {
517        let mut tmp = [0u64; NLIMBS];
518        modmul(&mut tmp, &t4, &t5);
519        t4 = tmp;
520    }
521    modnsqr(&mut t4, 28);
522    {
523        let mut tmp = [0u64; NLIMBS];
524        modmul(&mut tmp, &t2, &t4);
525        t2 = tmp;
526    }
527    {
528        let mut tmp = [0u64; NLIMBS];
529        modsqr(&mut tmp, &t2);
530        t4 = tmp;
531    }
532    {
533        let mut tmp = [0u64; NLIMBS];
534        modmul(&mut tmp, &t3, &t4);
535        t3 = tmp;
536    }
537    modnsqr(&mut t3, 59);
538    {
539        let mut tmp = [0u64; NLIMBS];
540        modmul(&mut tmp, &t2, &t3);
541        t2 = tmp;
542    }
543    {
544        let mut tmp = [0u64; NLIMBS];
545        modmul(&mut tmp, &t1, &t2);
546        t1 = tmp;
547    }
548    {
549        let mut tmp = [0u64; NLIMBS];
550        modmul(&mut tmp, z, &t1);
551        *z = tmp;
552    }
553    {
554        let mut tmp = [0u64; NLIMBS];
555        modmul(&mut tmp, &t0, z);
556        t0 = tmp;
557    }
558    {
559        let mut tmp = [0u64; NLIMBS];
560        modmul(&mut tmp, &t1, &t0);
561        t1 = tmp;
562    }
563    {
564        let mut tmp = [0u64; NLIMBS];
565        modsqr(&mut tmp, &t1);
566        t2 = tmp;
567    }
568    {
569        let mut tmp = [0u64; NLIMBS];
570        modmul(&mut tmp, &t1, &t2);
571        t2 = tmp;
572    }
573    {
574        let mut tmp = [0u64; NLIMBS];
575        modsqr(&mut tmp, &t2);
576        t2 = tmp;
577    }
578    {
579        let mut tmp = [0u64; NLIMBS];
580        modmul(&mut tmp, &t1, &t2);
581        t2 = tmp;
582    }
583    {
584        let mut tmp = [0u64; NLIMBS];
585        modmul(&mut tmp, &t0, &t2);
586        t0 = tmp;
587    }
588    {
589        let mut tmp = [0u64; NLIMBS];
590        modmul(&mut tmp, z, &t0);
591        *z = tmp;
592    }
593    {
594        let mut tmp = [0u64; NLIMBS];
595        modsqr(&mut tmp, z);
596        t2 = tmp;
597    }
598    {
599        let mut tmp = [0u64; NLIMBS];
600        modmul(&mut tmp, z, &t2);
601        t2 = tmp;
602    }
603    {
604        let mut tmp = [0u64; NLIMBS];
605        modmul(&mut tmp, &t0, &t2);
606        t0 = tmp;
607    }
608    {
609        let mut tmp = [0u64; NLIMBS];
610        modmul(&mut tmp, &t1, &t0);
611        t1 = tmp;
612    }
613    modcpy(&mut t2, &t1);
614    modnsqr(&mut t2, 128);
615    {
616        let mut tmp = [0u64; NLIMBS];
617        modmul(&mut tmp, &t1, &t2);
618        t1 = tmp;
619    }
620    {
621        let mut tmp = [0u64; NLIMBS];
622        modmul(&mut tmp, &t0, &t1);
623        t0 = tmp;
624    }
625    modnsqr(&mut t0, 125);
626    {
627        let mut tmp = [0u64; NLIMBS];
628        modmul(&mut tmp, z, &t0);
629        *z = tmp;
630    }
631}
632
633/// `modinv`: `z <- 1 / x mod p`. If `h` is `Some(h)` the precomputed
634/// progenitor `x^((p-3)/4)` is used; otherwise computed via `modpro`.
635#[inline]
636pub(crate) fn modinv(z: &mut [u64; NLIMBS], x: &[u64; NLIMBS], h: Option<&[u64; NLIMBS]>) {
637    let mut s = [0u64; NLIMBS];
638    let mut t = [0u64; NLIMBS];
639    match h {
640        None => modpro(&mut t, x),
641        Some(h) => modcpy(&mut t, h),
642    }
643    modcpy(&mut s, x);
644    modnsqr(&mut t, 2);
645    modmul(z, &s, &t);
646}
647
648/// Convert from canonical integer form to internal Montgomery form
649/// (multiplies by `R2_NRES` and divides by Montgomery `R`).
650#[inline]
651pub(crate) fn nres(n: &mut [u64; NLIMBS], m: &[u64; NLIMBS]) {
652    modmul(n, m, &R2_NRES);
653}
654
655/// Convert from internal Montgomery form back to canonical integer
656/// form, fully reducing modulo `p`.
657#[inline]
658pub(crate) fn redc(m: &mut [u64; NLIMBS], n: &[u64; NLIMBS]) {
659    let mut c = [0u64; NLIMBS];
660    c[0] = 1;
661    modmul(m, n, &c);
662    let _ = modfsb(m);
663}
664
665/// Returns `1` if `a == 0 mod p`, `0` otherwise.
666#[inline]
667pub(crate) fn modis0(a: &[u64; NLIMBS]) -> u32 {
668    let mut c = [0u64; NLIMBS];
669    redc(&mut c, a);
670    let mut d: u64 = 0;
671    for limb in c.iter() {
672        d |= *limb;
673    }
674    ((1u64) & ((d.wrapping_sub(1)) >> RADIX)) as u32
675}
676
677/// Returns `1` if `a == 1 mod p`, `0` otherwise.
678#[inline]
679pub(crate) fn modis1(a: &[u64; NLIMBS]) -> u32 {
680    let mut c = [0u64; NLIMBS];
681    redc(&mut c, a);
682    let mut d: u64 = 0;
683    for limb in c.iter().skip(1) {
684        d |= *limb;
685    }
686    let c0 = c[0];
687    ((1u64) & ((d.wrapping_sub(1)) >> RADIX) & (((c0 ^ 1).wrapping_sub(1)) >> RADIX)) as u32
688}
689
690/// Returns `1` if `a == b mod p`, `0` otherwise. Constant-time: folds
691/// the per-limb differences into a single bit using masked XOR.
692#[inline]
693pub(crate) fn modcmp(a: &[u64; NLIMBS], b: &[u64; NLIMBS]) -> u32 {
694    let mut c = [0u64; NLIMBS];
695    let mut d = [0u64; NLIMBS];
696    redc(&mut c, a);
697    redc(&mut d, b);
698    let mut eq: u64 = 1;
699    for i in 0..NLIMBS {
700        eq &= ((c[i] ^ d[i]).wrapping_sub(1)) >> RADIX;
701    }
702    eq &= 1;
703    eq as u32
704}
705
706/// Returns `1` if `x` is a quadratic residue (or zero), `0`
707/// otherwise. If `h` is `Some`, it is interpreted as the precomputed
708/// progenitor `x^((p-3)/4)`; otherwise the progenitor is computed.
709#[inline]
710pub(crate) fn modqr(x: &[u64; NLIMBS], h: Option<&[u64; NLIMBS]>) -> u32 {
711    let mut r = [0u64; NLIMBS];
712    match h {
713        None => {
714            modpro(&mut r, x);
715            let mut r2 = [0u64; NLIMBS];
716            modsqr(&mut r2, &r);
717            r = r2;
718        }
719        Some(h) => {
720            modsqr(&mut r, h);
721        }
722    }
723    let mut r2 = [0u64; NLIMBS];
724    modmul(&mut r2, &r, x);
725    let r = r2;
726    modis1(&r) | modis0(x)
727}
728
729/// `r <- sqrt(x) mod p`. Since `p = 3 mod 4`, the square root is
730/// `x^((p+1)/4) = x * x^((p-3)/4)`, so the progenitor (cached as `h`
731/// when available) is multiplied by `x` to get the answer. Output is
732/// well-defined modulo a sign.
733#[inline]
734pub(crate) fn modsqrt(r: &mut [u64; NLIMBS], x: &[u64; NLIMBS], h: Option<&[u64; NLIMBS]>) {
735    let mut y = [0u64; NLIMBS];
736    let mut s = [0u64; NLIMBS];
737    match h {
738        None => modpro(&mut y, x),
739        Some(h) => modcpy(&mut y, h),
740    }
741    modmul(&mut s, &y, x);
742    modcpy(r, &s);
743}
744
745/// Set `a` to the small integer `x` in internal Montgomery form.
746#[inline]
747pub(crate) fn modint(a: &mut [u64; NLIMBS], x: u64) {
748    a[0] = x;
749    a[1] = 0;
750    a[2] = 0;
751    a[3] = 0;
752    a[4] = 0;
753    a[5] = 0;
754    a[6] = 0;
755    let a_copy = *a;
756    nres(a, &a_copy);
757}
758
759/// `c <- a * x mod 2p` for a small integer `x`.
760#[inline]
761pub(crate) fn modmli(c: &mut [u64; NLIMBS], a: &[u64; NLIMBS], x: u64) {
762    let mut t = [0u64; NLIMBS];
763    modint(&mut t, x);
764    modmul(c, a, &t);
765}
766
767/// Shift `a` left by `n < 55` bits (per-limb, with carry into the
768/// next limb).
769#[inline]
770pub(crate) fn modshl(a: &mut [u64; NLIMBS], n: u32) {
771    a[6] = (a[6] << n) | (a[5] >> (RADIX - n));
772    for i in (1..=5).rev() {
773        a[i] = ((a[i] << n) & MASK) | (a[i - 1] >> (RADIX - n));
774    }
775    a[0] = (a[0] << n) & MASK;
776}
777
778/// Shift `a` right by `n < 55` bits, returning the low `n`
779/// shifted-out bits.
780#[inline]
781pub(crate) fn modshr(a: &mut [u64; NLIMBS], n: u32) -> u64 {
782    let r = a[0] & ((1u64 << n) - 1);
783    for i in 0..6 {
784        a[i] = (a[i] >> n) | ((a[i + 1] << (RADIX - n)) & MASK);
785    }
786    a[6] >>= n;
787    r
788}
789
790/// Constant-time conditional swap of `f` and `g` if `b == 1`, no-op
791/// if `b == 0`.
792#[inline]
793pub(crate) fn modcsw(b: u64, g: &mut [u64; NLIMBS], f: &mut [u64; NLIMBS]) {
794    let r: u64 = 0x3cc3_c33c_5aa5_a55a;
795    let c0 = (1u64.wrapping_sub(b)).wrapping_add(r);
796    let c1 = b.wrapping_add(r);
797    for i in 0..NLIMBS {
798        let s = g[i];
799        let t = f[i];
800        let w = r.wrapping_mul(t.wrapping_add(s));
801        let new_f = c0
802            .wrapping_mul(t)
803            .wrapping_add(c1.wrapping_mul(s))
804            .wrapping_sub(w);
805        let new_g = c0
806            .wrapping_mul(s)
807            .wrapping_add(c1.wrapping_mul(t))
808            .wrapping_sub(w);
809        f[i] = new_f;
810        g[i] = new_g;
811    }
812}
813
814/// Bytes per encoded Fp element for Level 3.
815const ENCODED_BYTES: usize = 48;
816
817/// Serialize `a` (in internal Montgomery form) to its canonical
818/// 48-byte little-endian representation: convert back to canonical
819/// integer form via [`redc`], then peel off one byte at a time.
820#[inline]
821pub(crate) fn fp_encode(out: &mut [u8], a: &[u64; NLIMBS]) {
822    debug_assert!(out.len() >= ENCODED_BYTES);
823    let mut c = [0u64; NLIMBS];
824    redc(&mut c, a);
825    for byte in out.iter_mut().take(ENCODED_BYTES) {
826        *byte = (c[0] & 0xff) as u8;
827        let _ = modshr(&mut c, 8);
828    }
829}
830
831/// Parse 48 canonical little-endian bytes into an Fp element.
832/// Returns `0xFFFF_FFFF` on in-range input, `0` otherwise; on failure
833/// `out` is zeroed.
834#[inline]
835pub(crate) fn fp_decode(out: &mut [u64; NLIMBS], bytes: &[u8]) -> u32 {
836    if bytes.len() < ENCODED_BYTES {
837        *out = [0; NLIMBS];
838        return 0;
839    }
840    *out = [0; NLIMBS];
841    // Build the integer by shifting in one byte at a time, MSB first.
842    for i in (0..ENCODED_BYTES).rev() {
843        modshl(out, 8);
844        out[0] = out[0].wrapping_add(bytes[i] as u64);
845    }
846    // res is all-ones if the value was in `[0, p)`, all-zeros otherwise.
847    let res_u64 = 0u64.wrapping_sub(modfsb(out) as u64);
848    let res_u32 = res_u64 as u32;
849    let out_copy = *out;
850    nres(out, &out_copy);
851    for limb in out.iter_mut() {
852        *limb &= res_u64;
853    }
854    res_u32
855}
856
857/// Partial reduction of a 6-limb 64-bit-saturated accumulator: split
858/// off the top byte of limb 5 and fold it back into the low limbs
859/// using the identity `65 * 2^376 = 1 (mod p)`. Used by
860/// [`fp_decode_reduce`] to absorb each 48-byte chunk of a long input.
861fn partial_reduce_6(out: &mut [u64; 6], src: &[u64; 6]) {
862    let h = src[5] >> 56;
863    let l = src[5] & 0x00FF_FFFF_FFFF_FFFF;
864    // Add floor(h/65) + (h mod 65) * 2^376 to the low part.
865    let quo = (h.wrapping_mul(0xFC1)) >> 18;
866    let rem = h.wrapping_sub(65u64.wrapping_mul(quo));
867    let (r0, c0) = src[0].overflowing_add(quo);
868    let (r1, c1) = src[1].overflowing_add(c0 as u64);
869    let (r2, c2) = src[2].overflowing_add(c1 as u64);
870    let (r3, c3) = src[3].overflowing_add(c2 as u64);
871    let (r4, c4) = src[4].overflowing_add(c3 as u64);
872    let r5 = l.wrapping_add(rem << 56).wrapping_add(c4 as u64);
873    out[0] = r0;
874    out[1] = r1;
875    out[2] = r2;
876    out[3] = r3;
877    out[4] = r4;
878    out[5] = r5;
879}
880
881/// Parse a little-endian byte string of arbitrary length, reducing it
882/// modulo `p`. Always succeeds. Used to map hash output uniformly into
883/// Fp.
884#[inline]
885pub(crate) fn fp_decode_reduce(out: &mut [u64; NLIMBS], bytes: &[u8]) {
886    *out = [0; NLIMBS];
887    if bytes.is_empty() {
888        return;
889    }
890
891    let mut len = bytes.len();
892    let rem = len % ENCODED_BYTES;
893    if rem != 0 {
894        // Decode a partial trailing block (zero-padded to 48 bytes;
895        // the value is already < p so no reduction is needed).
896        let k = len - rem;
897        let mut tmp = [0u8; ENCODED_BYTES];
898        tmp[..(len - k)].copy_from_slice(&bytes[k..]);
899        let _ = fp_decode(out, &tmp);
900        len = k;
901    }
902
903    while len > 0 {
904        // Shift the accumulator left by 2^384 in the Montgomery sense:
905        // multiplying by `R2` (which represents 2^384) makes room for
906        // the next 48-byte chunk in the high bits.
907        let out_copy = *out;
908        modmul(out, &out_copy, &R2);
909        len -= ENCODED_BYTES;
910        let mut t = [0u64; 6];
911        for (j, t_j) in t.iter_mut().enumerate() {
912            let off = len + j * 8;
913            *t_j = u64::from_le_bytes(
914                bytes[off..off + 8]
915                    .try_into()
916                    .expect("invariant: slice length is exactly 8"),
917            );
918        }
919        let mut reduced = [0u64; 6];
920        partial_reduce_6(&mut reduced, &t);
921        let mut tmp_bytes = [0u8; ENCODED_BYTES];
922        for (j, r) in reduced.iter().enumerate() {
923            tmp_bytes[j * 8..(j + 1) * 8].copy_from_slice(&r.to_le_bytes());
924        }
925        let mut a = [0u64; NLIMBS];
926        let _ = fp_decode(&mut a, &tmp_bytes);
927        let out_copy = *out;
928        modadd(out, &out_copy, &a);
929    }
930}
931
932/// Borrow a `&Array<u64, U7>` as `&[u64; 7]` for the limb-level helpers.
933#[inline]
934fn as_arr(a: &Array<u64, <Level3 as crate::params::SecurityLevel>::FpLimbs>) -> &[u64; NLIMBS] {
935    // SAFETY: Array<u64, U7> has exactly NLIMBS=7 elements, matching [u64; 7]
936    <&[u64; NLIMBS]>::try_from(&a[..]).expect("invariant: Level3 FpLimbs == U7")
937}
938
939/// Mutable borrow of `&mut Array<u64, U7>` as `&mut [u64; 7]`.
940#[inline]
941fn as_arr_mut(
942    a: &mut Array<u64, <Level3 as crate::params::SecurityLevel>::FpLimbs>,
943) -> &mut [u64; NLIMBS] {
944    // SAFETY: Array<u64, U7> has exactly NLIMBS=7 elements, matching [u64; 7]
945    <&mut [u64; NLIMBS]>::try_from(&mut a[..]).expect("invariant: Level3 FpLimbs == U7")
946}
947
948impl FpBackend for Level3 {
949    #[inline]
950    fn set_zero(out: &mut Array<u64, Self::FpLimbs>) {
951        *as_arr_mut(out) = ZERO;
952    }
953
954    #[inline]
955    fn set_one(out: &mut Array<u64, Self::FpLimbs>) {
956        *as_arr_mut(out) = ONE;
957    }
958
959    #[inline]
960    fn set_small(out: &mut Array<u64, Self::FpLimbs>, val: u64) {
961        modint(as_arr_mut(out), val);
962    }
963
964    #[inline]
965    fn is_equal(a: &Array<u64, Self::FpLimbs>, b: &Array<u64, Self::FpLimbs>) -> Choice {
966        Choice::from(modcmp(as_arr(a), as_arr(b)) as u8)
967    }
968
969    #[inline]
970    fn is_zero(a: &Array<u64, Self::FpLimbs>) -> Choice {
971        Choice::from(modis0(as_arr(a)) as u8)
972    }
973
974    #[inline]
975    fn copy(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
976        modcpy(as_arr_mut(out), as_arr(a));
977    }
978
979    #[inline]
980    fn add(
981        out: &mut Array<u64, Self::FpLimbs>,
982        a: &Array<u64, Self::FpLimbs>,
983        b: &Array<u64, Self::FpLimbs>,
984    ) {
985        modadd(as_arr_mut(out), as_arr(a), as_arr(b));
986    }
987
988    #[inline]
989    fn sub(
990        out: &mut Array<u64, Self::FpLimbs>,
991        a: &Array<u64, Self::FpLimbs>,
992        b: &Array<u64, Self::FpLimbs>,
993    ) {
994        modsub(as_arr_mut(out), as_arr(a), as_arr(b));
995    }
996
997    #[inline]
998    fn neg(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
999        modneg(as_arr_mut(out), as_arr(a));
1000    }
1001
1002    #[inline]
1003    fn mul(
1004        out: &mut Array<u64, Self::FpLimbs>,
1005        a: &Array<u64, Self::FpLimbs>,
1006        b: &Array<u64, Self::FpLimbs>,
1007    ) {
1008        modmul(as_arr_mut(out), as_arr(a), as_arr(b));
1009    }
1010
1011    #[inline]
1012    fn sqr(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1013        modsqr(as_arr_mut(out), as_arr(a));
1014    }
1015
1016    #[inline]
1017    fn inv(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1018        let a_copy = *as_arr(a);
1019        modinv(as_arr_mut(out), &a_copy, None);
1020    }
1021
1022    #[inline]
1023    fn sqrt(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1024        let a_copy = *as_arr(a);
1025        modsqrt(as_arr_mut(out), &a_copy, None);
1026    }
1027
1028    #[inline]
1029    fn is_square(a: &Array<u64, Self::FpLimbs>) -> Choice {
1030        Choice::from(modqr(as_arr(a), None) as u8)
1031    }
1032
1033    #[inline]
1034    fn half(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1035        modmul(as_arr_mut(out), &TWO_INV, as_arr(a));
1036    }
1037
1038    #[inline]
1039    fn div3(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1040        modmul(as_arr_mut(out), &THREE_INV, as_arr(a));
1041    }
1042
1043    #[inline]
1044    fn exp3div4(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>) {
1045        modpro(as_arr_mut(out), as_arr(a));
1046    }
1047
1048    #[inline]
1049    fn mul_small(out: &mut Array<u64, Self::FpLimbs>, a: &Array<u64, Self::FpLimbs>, val: u32) {
1050        modmli(as_arr_mut(out), as_arr(a), val as u64);
1051    }
1052
1053    #[inline]
1054    fn encode(out: &mut [u8], a: &Array<u64, Self::FpLimbs>) {
1055        fp_encode(out, as_arr(a));
1056    }
1057
1058    #[inline]
1059    fn decode(out: &mut Array<u64, Self::FpLimbs>, bytes: &[u8]) -> Choice {
1060        Choice::from((fp_decode(as_arr_mut(out), bytes) & 1) as u8)
1061    }
1062
1063    #[inline]
1064    fn decode_reduce(out: &mut Array<u64, Self::FpLimbs>, bytes: &[u8]) {
1065        fp_decode_reduce(as_arr_mut(out), bytes);
1066    }
1067
1068    #[inline]
1069    fn cswap(a: &mut Array<u64, Self::FpLimbs>, b: &mut Array<u64, Self::FpLimbs>, ctl: Choice) {
1070        modcsw(ctl.unwrap_u8() as u64, as_arr_mut(a), as_arr_mut(b));
1071    }
1072
1073    #[inline]
1074    fn select(
1075        out: &mut Array<u64, Self::FpLimbs>,
1076        a0: &Array<u64, Self::FpLimbs>,
1077        a1: &Array<u64, Self::FpLimbs>,
1078        ctl: Choice,
1079    ) {
1080        let cw = 0u64.wrapping_sub(ctl.unwrap_u8() as u64);
1081        let a0r = as_arr(a0);
1082        let a1r = as_arr(a1);
1083        let out_r = as_arr_mut(out);
1084        for i in 0..NLIMBS {
1085            out_r[i] = a0r[i] ^ (cw & (a0r[i] ^ a1r[i]));
1086        }
1087    }
1088}