p3_goldilocks/
goldilocks.rs

1use alloc::vec;
2use alloc::vec::Vec;
3use core::fmt::{Debug, Display, Formatter};
4use core::hash::{Hash, Hasher};
5use core::iter::{Product, Sum};
6use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
7use core::{array, fmt};
8
9use num_bigint::BigUint;
10use p3_challenger::UniformSamplingField;
11use p3_field::exponentiation::exp_10540996611094048183;
12use p3_field::integers::QuotientMap;
13use p3_field::op_assign_macros::{
14    impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign,
15};
16use p3_field::{
17    Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField,
18    PrimeField64, RawDataSerializable, TwoAdicField, halve_u64, impl_raw_serializable_primefield64,
19    quotient_map_large_iint, quotient_map_large_uint, quotient_map_small_int,
20};
21use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner};
22use rand::Rng;
23use rand::distr::{Distribution, StandardUniform};
24use serde::{Deserialize, Serialize};
25
26/// The Goldilocks prime
27pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001;
28
29/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
30///
31/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64.
32#[derive(Copy, Clone, Default, Serialize, Deserialize)]
33#[repr(transparent)] // Important for reasoning about memory layout
34#[must_use]
35pub struct Goldilocks {
36    /// Not necessarily canonical.
37    pub(crate) value: u64,
38}
39
40impl Goldilocks {
41    /// Create a new field element from any `u64`.
42    ///
43    /// Any `u64` value is accepted. No reduction is performed since
44    /// Goldilocks uses a non-canonical internal representation.
45    #[inline]
46    pub const fn new(value: u64) -> Self {
47        Self { value }
48    }
49
50    /// Convert a `[u64; N]` array to an array of field elements.
51    ///
52    /// Const version of `input.map(Goldilocks::new)`.
53    #[inline]
54    pub const fn new_array<const N: usize>(input: [u64; N]) -> [Self; N] {
55        let mut output = [Self::ZERO; N];
56        let mut i = 0;
57        while i < N {
58            output[i].value = input[i];
59            i += 1;
60        }
61        output
62    }
63
64    /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`.
65    const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg();
66
67    /// A list of generators for the two-adic subgroups of the goldilocks field.
68    ///
69    /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`.
70    pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([
71        0x0000000000000001,
72        0xffffffff00000000,
73        0x0001000000000000,
74        0xfffffffeff000001,
75        0xefffffff00000001,
76        0x00003fffffffc000,
77        0x0000008000000000,
78        0xf80007ff08000001,
79        0xbf79143ce60ca966,
80        0x1905d02a5c411f4e,
81        0x9d8f2ad78bfed972,
82        0x0653b4801da1c8cf,
83        0xf2c35199959dfcb6,
84        0x1544ef2335d17997,
85        0xe0ee099310bba1e2,
86        0xf6b2cffe2306baac,
87        0x54df9630bf79450e,
88        0xabd0a6e8aa3d8a0e,
89        0x81281a7b05f9beac,
90        0xfbd41c6b8caa3302,
91        0x30ba2ecd5e93e76d,
92        0xf502aef532322654,
93        0x4b2a18ade67246b5,
94        0xea9d5a1336fbc98b,
95        0x86cdcc31c307e171,
96        0x4bbaf5976ecfefd8,
97        0xed41d05b78d6e286,
98        0x10d78dd8915a171d,
99        0x59049500004a4485,
100        0xdfa8c93ba46d2666,
101        0x7e9bd009b86a0845,
102        0x400a7f755588e659,
103        0x185629dcda58878c,
104    ]);
105
106    /// A list of powers of two from 0 to 95.
107    ///
108    /// Note that 2^{96} = -1 mod P so all powers of two can be simply
109    /// derived from this list.
110    const POWERS_OF_TWO: [Self; 96] = {
111        let mut powers_of_two = [Self::ONE; 96];
112
113        let mut i = 1;
114        while i < 64 {
115            powers_of_two[i] = Self::new(1 << i);
116            i += 1;
117        }
118        let mut var = Self::new(1 << 63);
119        while i < 96 {
120            var = const_add(var, var);
121            powers_of_two[i] = var;
122            i += 1;
123        }
124        powers_of_two
125    };
126}
127
128impl PartialEq for Goldilocks {
129    fn eq(&self, other: &Self) -> bool {
130        self.as_canonical_u64() == other.as_canonical_u64()
131    }
132}
133
134impl Eq for Goldilocks {}
135
136impl Packable for Goldilocks {}
137
138impl Hash for Goldilocks {
139    fn hash<H: Hasher>(&self, state: &mut H) {
140        state.write_u64(self.as_canonical_u64());
141    }
142}
143
144impl Ord for Goldilocks {
145    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
146        self.as_canonical_u64().cmp(&other.as_canonical_u64())
147    }
148}
149
150impl PartialOrd for Goldilocks {
151    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
152        Some(self.cmp(other))
153    }
154}
155
156impl Display for Goldilocks {
157    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
158        Display::fmt(&self.as_canonical_u64(), f)
159    }
160}
161
162impl Debug for Goldilocks {
163    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
164        Debug::fmt(&self.as_canonical_u64(), f)
165    }
166}
167
168impl Distribution<Goldilocks> for StandardUniform {
169    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Goldilocks {
170        loop {
171            let next_u64 = rng.next_u64();
172            let is_canonical = next_u64 < Goldilocks::ORDER_U64;
173            if is_canonical {
174                return Goldilocks::new(next_u64);
175            }
176        }
177    }
178}
179
180impl UniformSamplingField for Goldilocks {
181    const MAX_SINGLE_SAMPLE_BITS: usize = 24;
182    const SAMPLING_BITS_M: [u64; 64] = {
183        let prime: u64 = P;
184        let mut a = [0u64; 64];
185        let mut k = 0;
186        while k < 64 {
187            if k == 0 {
188                a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always.
189            } else {
190                // Create a mask to zero out the last k bits
191                let mask = !((1u64 << k) - 1);
192                a[k] = prime & mask;
193            }
194            k += 1;
195        }
196        a
197    };
198}
199
200impl PrimeCharacteristicRing for Goldilocks {
201    type PrimeSubfield = Self;
202
203    const ZERO: Self = Self::new(0);
204    const ONE: Self = Self::new(1);
205    const TWO: Self = Self::new(2);
206    const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
207
208    #[inline]
209    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
210        f
211    }
212
213    #[inline]
214    fn from_bool(b: bool) -> Self {
215        Self::new(b.into())
216    }
217
218    #[inline]
219    fn halve(&self) -> Self {
220        Self::new(halve_u64::<P>(self.value))
221    }
222
223    #[inline]
224    fn mul_2exp_u64(&self, exp: u64) -> Self {
225        // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P.
226        if exp < 96 {
227            *self * Self::POWERS_OF_TWO[exp as usize]
228        } else if exp < 192 {
229            -*self * Self::POWERS_OF_TWO[(exp - 96) as usize]
230        } else {
231            self.mul_2exp_u64(exp % 192)
232        }
233    }
234
235    #[inline]
236    fn div_2exp_u64(&self, mut exp: u64) -> Self {
237        // In the goldilocks field, 2^192 = 1 mod P.
238        // Thus 2^{-n} = 2^{192 - n} mod P.
239        exp %= 192;
240        self.mul_2exp_u64(192 - exp)
241    }
242
243    #[inline]
244    fn sum_array<const N: usize>(input: &[Self]) -> Self {
245        assert_eq!(N, input.len());
246        // Benchmarking shows that for N <= 3 it's faster to sum the elements directly
247        // but for N > 3 it's faster to use the .sum() methods which passes through u128's
248        // allowing for delayed reductions.
249        match N {
250            0 => Self::ZERO,
251            1 => input[0],
252            2 => input[0] + input[1],
253            3 => input[0] + input[1] + input[2],
254            _ => input.iter().copied().sum(),
255        }
256    }
257
258    #[inline]
259    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
260        // The constant OFFSET has 2 important properties:
261        // 1. It is a multiple of P.
262        // 2. It is greater than the maximum possible value of the sum of the products of two u64s.
263        const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
264        assert!((N as u32) <= (1 << 31));
265        match N {
266            0 => Self::ZERO,
267            1 => lhs[0] * rhs[0],
268            2 => {
269                // We unroll the N = 2 case as it is slightly faster and this is an important case
270                // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension.
271                let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
272                let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
273
274                // We know that long_prod_0, long_prod_1 < OFFSET.
275                // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET.
276                let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
277                // Compiler really likes defining sum_corr here instead of in the if/else.
278                let sum_corr = sum.wrapping_sub(OFFSET);
279                if over {
280                    reduce128(sum_corr)
281                } else {
282                    reduce128(sum)
283                }
284            }
285            _ => {
286                let (lo_plus_hi, hi) = lhs
287                    .iter()
288                    .zip(rhs)
289                    .map(|(x, y)| (x.value as u128) * (y.value as u128))
290                    .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
291                        // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits.
292                        let val_hi = (val >> 96) as u64;
293                        // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi.
294                        // As N <= 2^32, acc_hi cannot overflow.
295                        unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
296                    });
297                // First, remove the hi part from lo_plus_hi.
298                let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96);
299                // As 2^{96} = -1 mod P, we simply need to reduce lo - hi.
300                // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not over or underflow.
301                let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) };
302                reduce128(sum)
303            }
304        }
305    }
306
307    #[inline]
308    fn zero_vec(len: usize) -> Vec<Self> {
309        // SAFETY:
310        // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment
311        // and memory layout making `flatten_to_base` safe. This this will create
312        // a vector Goldilocks elements with value set to 0.
313        unsafe { flatten_to_base(vec![0u64; len]) }
314    }
315}
316
317/// Degree of the smallest permutation polynomial for Goldilocks.
318///
319/// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
320impl InjectiveMonomial<7> for Goldilocks {}
321
322impl PermutationMonomial<7> for Goldilocks {
323    /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}.
324    ///
325    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`.
326    fn injective_exp_root_n(&self) -> Self {
327        exp_10540996611094048183(*self)
328    }
329}
330
331impl RawDataSerializable for Goldilocks {
332    impl_raw_serializable_primefield64!();
333}
334
335impl Field for Goldilocks {
336    // TODO: Add cfg-guarded Packing for NEON
337
338    #[cfg(all(
339        target_arch = "x86_64",
340        target_feature = "avx2",
341        not(target_feature = "avx512f")
342    ))]
343    type Packing = crate::PackedGoldilocksAVX2;
344
345    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
346    type Packing = crate::PackedGoldilocksAVX512;
347    #[cfg(not(any(
348        all(
349            target_arch = "x86_64",
350            target_feature = "avx2",
351            not(target_feature = "avx512f")
352        ),
353        all(target_arch = "x86_64", target_feature = "avx512f"),
354    )))]
355    type Packing = Self;
356
357    // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator()
358    const GENERATOR: Self = Self::new(7);
359
360    fn is_zero(&self) -> bool {
361        self.value == 0 || self.value == Self::ORDER_U64
362    }
363
364    fn try_inverse(&self) -> Option<Self> {
365        if self.is_zero() {
366            return None;
367        }
368
369        Some(gcd_inversion(*self))
370    }
371
372    #[inline]
373    fn order() -> BigUint {
374        P.into()
375    }
376}
377
378// We use macros to implement QuotientMap<Int> for all integer types except for u64 and i64.
379quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
380quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
381quotient_map_large_uint!(
382    Goldilocks,
383    u64,
384    Goldilocks::ORDER_U64,
385    "`[0, 2^64 - 2^32]`",
386    "`[0, 2^64 - 1]`",
387    [u128]
388);
389quotient_map_large_iint!(
390    Goldilocks,
391    i64,
392    "`[-(2^63 - 2^31), 2^63 - 2^31]`",
393    "`[1 + 2^32 - 2^64, 2^64 - 1]`",
394    [(i128, u128)]
395);
396
397impl QuotientMap<u64> for Goldilocks {
398    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
399    ///
400    /// No reduction is needed as the internal value is allowed
401    /// to be any u64.
402    #[inline]
403    fn from_int(int: u64) -> Self {
404        Self::new(int)
405    }
406
407    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
408    ///
409    /// Return `None` if the given integer is greater than `p = 2^64 - 2^32 + 1`.
410    #[inline]
411    fn from_canonical_checked(int: u64) -> Option<Self> {
412        (int < Self::ORDER_U64).then(|| Self::new(int))
413    }
414
415    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
416    ///
417    /// # Safety
418    /// In this case this function is actually always safe as the internal
419    /// value is allowed to be any u64.
420    #[inline(always)]
421    unsafe fn from_canonical_unchecked(int: u64) -> Self {
422        Self::new(int)
423    }
424}
425
426impl QuotientMap<i64> for Goldilocks {
427    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
428    ///
429    /// We simply need to deal with the sign.
430    #[inline]
431    fn from_int(int: i64) -> Self {
432        if int >= 0 {
433            Self::new(int as u64)
434        } else {
435            Self::new(Self::ORDER_U64.wrapping_add_signed(int))
436        }
437    }
438
439    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
440    ///
441    /// Returns none if the input does not lie in the range `(-(2^63 - 2^31), 2^63 - 2^31)`.
442    #[inline]
443    fn from_canonical_checked(int: i64) -> Option<Self> {
444        const POS_BOUND: i64 = (P >> 1) as i64;
445        const NEG_BOUND: i64 = -POS_BOUND;
446        match int {
447            0..=POS_BOUND => Some(Self::new(int as u64)),
448            NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))),
449            _ => None,
450        }
451    }
452
453    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
454    ///
455    /// # Safety
456    /// In this case this function is actually always safe as the internal
457    /// value is allowed to be any u64.
458    #[inline(always)]
459    unsafe fn from_canonical_unchecked(int: i64) -> Self {
460        Self::from_int(int)
461    }
462}
463
464impl PrimeField for Goldilocks {
465    fn as_canonical_biguint(&self) -> BigUint {
466        self.as_canonical_u64().into()
467    }
468}
469
470impl PrimeField64 for Goldilocks {
471    const ORDER_U64: u64 = P;
472
473    #[inline]
474    fn as_canonical_u64(&self) -> u64 {
475        let mut c = self.value;
476        // We only need one condition subtraction, since 2 * ORDER would not fit in a u64.
477        if c >= Self::ORDER_U64 {
478            c -= Self::ORDER_U64;
479        }
480        c
481    }
482}
483
484impl TwoAdicField for Goldilocks {
485    const TWO_ADICITY: usize = 32;
486
487    fn two_adic_generator(bits: usize) -> Self {
488        assert!(bits <= Self::TWO_ADICITY);
489        Self::TWO_ADIC_GENERATORS[bits]
490    }
491}
492
493/// A const version of the addition function.
494///
495/// Useful for constructing constants values in const contexts. Outside of
496/// const contexts, Add should be used instead.
497#[inline]
498const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
499    let (sum, over) = lhs.value.overflowing_add(rhs.value);
500    let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER);
501    if over {
502        sum += Goldilocks::NEG_ORDER;
503    }
504    Goldilocks::new(sum)
505}
506
507impl Add for Goldilocks {
508    type Output = Self;
509
510    #[inline]
511    fn add(self, rhs: Self) -> Self {
512        let (sum, over) = self.value.overflowing_add(rhs.value);
513        let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
514        if over {
515            // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not
516            // sufficient for double-overflow.
517            // This assume does two things:
518            //  1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip
519            //     this check.
520            //  2. Hints to the compiler how rare this double-overflow is (thus handled better with
521            //     a branch).
522            unsafe {
523                assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
524            }
525            branch_hint();
526            sum += Self::NEG_ORDER; // Cannot overflow.
527        }
528        Self::new(sum)
529    }
530}
531
532impl Sub for Goldilocks {
533    type Output = Self;
534
535    #[inline]
536    fn sub(self, rhs: Self) -> Self {
537        let (diff, under) = self.value.overflowing_sub(rhs.value);
538        let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
539        if under {
540            // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not
541            // sufficient for double-underflow.
542            // This assume does two things:
543            //  1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER,
544            //     then it can skip this check.
545            //  2. Hints to the compiler how rare this double-underflow is (thus handled better
546            //     with a branch).
547            unsafe {
548                assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
549            }
550            branch_hint();
551            diff -= Self::NEG_ORDER; // Cannot underflow.
552        }
553        Self::new(diff)
554    }
555}
556
557impl Neg for Goldilocks {
558    type Output = Self;
559
560    #[inline]
561    fn neg(self) -> Self::Output {
562        Self::new(Self::ORDER_U64 - self.as_canonical_u64())
563    }
564}
565
566impl Mul for Goldilocks {
567    type Output = Self;
568
569    #[inline]
570    fn mul(self, rhs: Self) -> Self {
571        reduce128(u128::from(self.value) * u128::from(rhs.value))
572    }
573}
574
575impl_add_assign!(Goldilocks);
576impl_sub_assign!(Goldilocks);
577impl_mul_methods!(Goldilocks);
578impl_div_methods!(Goldilocks, Goldilocks);
579
580impl Sum for Goldilocks {
581    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
582        // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2.
583
584        // This sum will not overflow so long as iter.len() < 2^64.
585        let sum = iter.map(|x| x.value as u128).sum::<u128>();
586        reduce128(sum)
587    }
588}
589
590/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
591/// field order and `2^64`.
592#[inline]
593pub(crate) fn reduce128(x: u128) -> Goldilocks {
594    let (x_lo, x_hi) = split(x); // This is a no-op
595    let x_hi_hi = x_hi >> 32;
596    let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
597
598    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
599    if borrow {
600        branch_hint(); // A borrow is exceedingly rare. It is faster to branch.
601        t0 -= Goldilocks::NEG_ORDER; // Cannot underflow.
602    }
603    let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
604    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
605    Goldilocks::new(t2)
606}
607
608#[inline]
609#[allow(clippy::cast_possible_truncation)]
610const fn split(x: u128) -> (u64, u64) {
611    (x as u64, (x >> 64) as u64)
612}
613
614/// Fast addition modulo ORDER for x86-64.
615/// This function is marked unsafe for the following reasons:
616///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
617///   - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
618///     the registers, so its use is not recommended when either input will be used again.
619#[inline(always)]
620#[cfg(target_arch = "x86_64")]
621unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
622    unsafe {
623        let res_wrapped: u64;
624        let adjustment: u64;
625        core::arch::asm!(
626            "add {0}, {1}",
627            // Trick. The carry flag is set iff the addition overflowed.
628            // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
629            // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
630            // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
631            // on overflow; this happens be NEG_ORDER.
632            // Note that the CPU does not realize that the result of sbb x, x does not actually depend
633            // on x. We must write the result to a register that we know to be ready. We have a
634            // dependency on {1} anyway, so let's use it.
635            "sbb {1:e}, {1:e}",
636            inlateout(reg) x => res_wrapped,
637            inlateout(reg) y => adjustment,
638            options(pure, nomem, nostack),
639        );
640        assume(x != 0 || (res_wrapped == y && adjustment == 0));
641        assume(y != 0 || (res_wrapped == x && adjustment == 0));
642        // Add NEG_ORDER == subtract ORDER.
643        // Cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
644        res_wrapped + adjustment
645    }
646}
647
648#[inline(always)]
649#[cfg(not(target_arch = "x86_64"))]
650unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
651    let (res_wrapped, carry) = x.overflowing_add(y);
652    // Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
653    res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry)
654}
655
656/// Compute the inverse of a Goldilocks element `a` using the binary GCD algorithm.
657///
658/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf.
659/// The key idea is to compute update factors which are incorrect by a known power of 2 which
660/// can be corrected at the end. These update factors can then be used to construct the inverse
661/// via a simple linear combination.
662///
663/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic.
664fn gcd_inversion(input: Goldilocks) -> Goldilocks {
665    // Initialise our values to the value we want to invert and the prime.
666    let (mut a, mut b) = (input.value, P);
667
668    // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`.
669    // This means we will need `126` iterations of the inner loop ensure `len(a) + len(b) ≤ 2`.
670    // We split the iterations into 2 rounds of length 63.
671    const ROUND_SIZE: usize = 63;
672
673    // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted
674    // version which doesn't do any computations involving g. But either the compiler works this out
675    // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks.
676    let (f00, _, f10, _) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
677    let (_, _, f11, g11) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
678
679    // The update factors are i64's except we need to interpret -2^63 as 2^63.
680    // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`.
681    let u = from_unusual_int(f00);
682    let v = from_unusual_int(f10);
683    let u_fac11 = from_unusual_int(f11);
684    let v_fac11 = from_unusual_int(g11);
685
686    // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}.
687    // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66.
688    (u * u_fac11 + v * v_fac11).mul_2exp_u64(66)
689}
690
691/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63.
692const fn from_unusual_int(int: i64) -> Goldilocks {
693    if (int >= 0) || (int == i64::MIN) {
694        Goldilocks::new(int as u64)
695    } else {
696        Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int))
697    }
698}
699
700#[cfg(test)]
701mod tests {
702    use p3_field::extension::BinomialExtensionField;
703    use p3_field_testing::{
704        test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field,
705    };
706
707    use super::*;
708
709    type F = Goldilocks;
710    type EF = BinomialExtensionField<F, 5>;
711
712    #[test]
713    fn test_goldilocks() {
714        let f = F::new(100);
715        assert_eq!(f.as_canonical_u64(), 100);
716
717        // Over the Goldilocks field, the following set of equations hold
718        // p               = 0
719        // 2^64 - 2^32 + 1 = 0
720        // 2^64            = 2^32 - 1
721        let f = F::new(u64::MAX);
722        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
723
724        let f = F::from_u64(u64::MAX);
725        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
726
727        // Generator check
728        let expected_multiplicative_group_generator = F::new(7);
729        assert_eq!(F::GENERATOR, expected_multiplicative_group_generator);
730        assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64);
731
732        // Check on `reduce_u128`
733        let x = u128::MAX;
734        let y = reduce128(x);
735        // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1
736        // 2^128 - 1 = (2^64 - 1) * (2^64 + 1)
737        //           = (2^32 - 1 - 1) * (2^32 - 1 + 1)
738        //           = (2^32 - 2) * (2^32)
739        //           = 2^64 - 2 * 2^32
740        //           = 2^64 - 2^33
741        //           = 2^32 - 1 - 2^33
742        //           = - 2^32 - 1
743        let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE;
744        assert_eq!(y, expected_result);
745
746        let f = F::new(100);
747        assert_eq!(f.injective_exp_n().injective_exp_root_n(), f);
748        assert_eq!(y.injective_exp_n().injective_exp_root_n(), y);
749        assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO);
750    }
751
752    // Goldilocks has a redundant representation for both 0 and 1.
753    const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)];
754    const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)];
755
756    // Get the prime factorization of the order of the multiplicative group.
757    // i.e. the prime factorization of P - 1.
758    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] {
759        [
760            (BigUint::from(2u8), 32),
761            (BigUint::from(3u8), 1),
762            (BigUint::from(5u8), 1),
763            (BigUint::from(17u8), 1),
764            (BigUint::from(257u16), 1),
765            (BigUint::from(65537u32), 1),
766        ]
767    }
768
769    test_field!(
770        crate::Goldilocks,
771        &super::ZEROS,
772        &super::ONES,
773        &super::multiplicative_group_prime_factorization()
774    );
775    test_prime_field!(crate::Goldilocks);
776    test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES);
777    test_two_adic_field!(crate::Goldilocks);
778
779    test_field_dft!(
780        radix2dit,
781        crate::Goldilocks,
782        super::EF,
783        p3_dft::Radix2Dit<_>
784    );
785    test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers);
786    test_field_dft!(
787        parallel,
788        crate::Goldilocks,
789        super::EF,
790        p3_dft::Radix2DitParallel<crate::Goldilocks>
791    );
792}