Skip to main content

p3_goldilocks/
goldilocks.rs

1use alloc::vec;
2use alloc::vec::Vec;
3use core::fmt::{Debug, Display, Formatter};
4use core::hash::{Hash, Hasher};
5use core::iter::{Product, Sum};
6use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
7use core::{array, fmt};
8
9use num_bigint::BigUint;
10use p3_challenger::UniformSamplingField;
11use p3_field::exponentiation::exp_10540996611094048183;
12use p3_field::integers::QuotientMap;
13use p3_field::op_assign_macros::{
14    impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign,
15};
16use p3_field::{
17    Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField,
18    PrimeField64, RawDataSerializable, TwoAdicField, impl_raw_serializable_primefield64,
19    quotient_map_large_iint, quotient_map_large_uint, quotient_map_small_int,
20};
21use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner};
22use rand::Rng;
23use rand::distr::{Distribution, StandardUniform};
24use serde::{Deserialize, Serialize};
25
26/// The Goldilocks prime
27pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001;
28
29/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
30///
31/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64.
32#[derive(Copy, Clone, Default, Serialize, Deserialize)]
33#[repr(transparent)] // Important for reasoning about memory layout
34#[must_use]
35pub struct Goldilocks {
36    /// Not necessarily canonical.
37    pub(crate) value: u64,
38}
39
40impl Goldilocks {
41    /// Create a new field element from any `u64`.
42    ///
43    /// Any `u64` value is accepted. No reduction is performed since
44    /// Goldilocks uses a non-canonical internal representation.
45    #[inline]
46    pub const fn new(value: u64) -> Self {
47        Self { value }
48    }
49
50    /// Convert a `[u64; N]` array to an array of field elements.
51    ///
52    /// Const version of `input.map(Goldilocks::new)`.
53    #[inline]
54    pub const fn new_array<const N: usize>(input: [u64; N]) -> [Self; N] {
55        let mut output = [Self::ZERO; N];
56        let mut i = 0;
57        while i < N {
58            output[i].value = input[i];
59            i += 1;
60        }
61        output
62    }
63
64    /// Convert a `[[u64; N]; M]` array to a 2D array of field elements.
65    ///
66    /// Const version of `input.map(Goldilocks::new_array)`.
67    #[inline]
68    pub const fn new_2d_array<const N: usize, const M: usize>(
69        input: [[u64; N]; M],
70    ) -> [[Self; N]; M] {
71        let mut output = [[Self::ZERO; N]; M];
72        let mut i = 0;
73        while i < M {
74            output[i] = Self::new_array(input[i]);
75            i += 1;
76        }
77        output
78    }
79
80    /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`.
81    const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg();
82
83    /// A list of generators for the two-adic subgroups of the goldilocks field.
84    ///
85    /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`.
86    pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([
87        0x0000000000000001,
88        0xffffffff00000000,
89        0x0001000000000000,
90        0xfffffffeff000001,
91        0xefffffff00000001,
92        0x00003fffffffc000,
93        0x0000008000000000,
94        0xf80007ff08000001,
95        0xbf79143ce60ca966,
96        0x1905d02a5c411f4e,
97        0x9d8f2ad78bfed972,
98        0x0653b4801da1c8cf,
99        0xf2c35199959dfcb6,
100        0x1544ef2335d17997,
101        0xe0ee099310bba1e2,
102        0xf6b2cffe2306baac,
103        0x54df9630bf79450e,
104        0xabd0a6e8aa3d8a0e,
105        0x81281a7b05f9beac,
106        0xfbd41c6b8caa3302,
107        0x30ba2ecd5e93e76d,
108        0xf502aef532322654,
109        0x4b2a18ade67246b5,
110        0xea9d5a1336fbc98b,
111        0x86cdcc31c307e171,
112        0x4bbaf5976ecfefd8,
113        0xed41d05b78d6e286,
114        0x10d78dd8915a171d,
115        0x59049500004a4485,
116        0xdfa8c93ba46d2666,
117        0x7e9bd009b86a0845,
118        0x400a7f755588e659,
119        0x185629dcda58878c,
120    ]);
121
122    /// A list of powers of two from 0 to 95.
123    ///
124    /// Note that 2^{96} = -1 mod P so all powers of two can be simply
125    /// derived from this list.
126    const POWERS_OF_TWO: [Self; 96] = {
127        let mut powers_of_two = [Self::ONE; 96];
128
129        let mut i = 1;
130        while i < 64 {
131            powers_of_two[i] = Self::new(1 << i);
132            i += 1;
133        }
134        let mut var = Self::new(1 << 63);
135        while i < 96 {
136            var = const_add(var, var);
137            powers_of_two[i] = var;
138            i += 1;
139        }
140        powers_of_two
141    };
142}
143
144impl PartialEq for Goldilocks {
145    fn eq(&self, other: &Self) -> bool {
146        self.as_canonical_u64() == other.as_canonical_u64()
147    }
148}
149
150impl Eq for Goldilocks {}
151
152impl Packable for Goldilocks {}
153
154impl Hash for Goldilocks {
155    fn hash<H: Hasher>(&self, state: &mut H) {
156        state.write_u64(self.as_canonical_u64());
157    }
158}
159
160impl Ord for Goldilocks {
161    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
162        self.as_canonical_u64().cmp(&other.as_canonical_u64())
163    }
164}
165
166impl PartialOrd for Goldilocks {
167    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
168        Some(self.cmp(other))
169    }
170}
171
172impl Display for Goldilocks {
173    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
174        Display::fmt(&self.as_canonical_u64(), f)
175    }
176}
177
178impl Debug for Goldilocks {
179    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
180        Debug::fmt(&self.as_canonical_u64(), f)
181    }
182}
183
184impl Distribution<Goldilocks> for StandardUniform {
185    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Goldilocks {
186        loop {
187            let next_u64 = rng.next_u64();
188            let is_canonical = next_u64 < Goldilocks::ORDER_U64;
189            if is_canonical {
190                return Goldilocks::new(next_u64);
191            }
192        }
193    }
194}
195
196impl UniformSamplingField for Goldilocks {
197    const MAX_SINGLE_SAMPLE_BITS: usize = 24;
198    const SAMPLING_BITS_M: [u64; 64] = {
199        let prime: u64 = P;
200        let mut a = [0u64; 64];
201        let mut k = 0;
202        while k < 64 {
203            if k == 0 {
204                a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always.
205            } else {
206                // Create a mask to zero out the last k bits
207                let mask = !((1u64 << k) - 1);
208                a[k] = prime & mask;
209            }
210            k += 1;
211        }
212        a
213    };
214}
215
216impl PrimeCharacteristicRing for Goldilocks {
217    type PrimeSubfield = Self;
218
219    const ZERO: Self = Self::new(0);
220    const ONE: Self = Self::new(1);
221    const TWO: Self = Self::new(2);
222    const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
223
224    #[inline]
225    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
226        f
227    }
228
229    #[inline]
230    fn from_bool(b: bool) -> Self {
231        Self::new(b.into())
232    }
233
234    #[inline]
235    fn halve(&self) -> Self {
236        // Branchless halving: x/2 = (x >> 1) + ((x & 1) * (p+1)/2).
237        // When x is odd, add (p+1)/2 to compensate for the lost bit.
238        // Uses mask arithmetic to avoid the 50/50 unpredictable branch.
239        const HALF_P_PLUS_1: u64 = (P + 1) >> 1; // 0x7FFFFFFF80000001
240        let lo_bit = self.value & 1;
241        let half = self.value >> 1;
242        let mask = 0u64.wrapping_sub(lo_bit); // all-ones when odd, zero when even
243        Self::new(half.wrapping_add(mask & HALF_P_PLUS_1))
244    }
245
246    #[inline]
247    fn mul_2exp_u64(&self, exp: u64) -> Self {
248        // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P.
249        if exp < 96 {
250            *self * Self::POWERS_OF_TWO[exp as usize]
251        } else if exp < 192 {
252            -*self * Self::POWERS_OF_TWO[(exp - 96) as usize]
253        } else {
254            self.mul_2exp_u64(exp % 192)
255        }
256    }
257
258    #[inline]
259    fn div_2exp_u64(&self, mut exp: u64) -> Self {
260        // In the goldilocks field, 2^192 = 1 mod P.
261        // Thus 2^{-n} = 2^{192 - n} mod P.
262        exp %= 192;
263        match exp {
264            0 => *self,
265            1 => self.halve(),
266            _ => self.mul_2exp_u64(192 - exp),
267        }
268    }
269
270    #[inline]
271    fn sum_array<const N: usize>(input: &[Self]) -> Self {
272        assert_eq!(N, input.len());
273        // Benchmarking shows that for N <= 3 it's faster to sum the elements directly
274        // but for N > 3 it's faster to use the .sum() methods which passes through u128's
275        // allowing for delayed reductions.
276        match N {
277            0 => Self::ZERO,
278            1 => input[0],
279            2 => input[0] + input[1],
280            3 => input[0] + input[1] + input[2],
281            _ => input.iter().copied().sum(),
282        }
283    }
284
285    #[inline]
286    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
287        // The constant OFFSET has 2 important properties:
288        // 1. It is a multiple of P.
289        // 2. It is greater than the maximum possible value of the sum of the products of two u64s.
290        const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
291        assert!((N as u32) <= (1 << 31));
292        match N {
293            0 => Self::ZERO,
294            1 => lhs[0] * rhs[0],
295            2 => {
296                // We unroll the N = 2 case as it is slightly faster and this is an important case
297                // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension.
298                let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
299                let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
300
301                // We know that long_prod_0, long_prod_1 < OFFSET.
302                // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET.
303                let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
304                // Compiler really likes defining sum_corr here instead of in the if/else.
305                let sum_corr = sum.wrapping_sub(OFFSET);
306                if over {
307                    reduce128(sum_corr)
308                } else {
309                    reduce128(sum)
310                }
311            }
312            _ => {
313                let (lo_plus_hi, hi) = lhs
314                    .iter()
315                    .zip(rhs)
316                    .map(|(x, y)| (x.value as u128) * (y.value as u128))
317                    .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
318                        // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits.
319                        let val_hi = (val >> 96) as u64;
320                        // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi.
321                        // As N <= 2^32, acc_hi cannot overflow.
322                        unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
323                    });
324                // First, remove the hi part from lo_plus_hi.
325                let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96);
326                // As 2^{96} = -1 mod P, we simply need to reduce lo - hi.
327                // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not over or underflow.
328                let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) };
329                reduce128(sum)
330            }
331        }
332    }
333
334    #[inline]
335    fn zero_vec(len: usize) -> Vec<Self> {
336        // SAFETY:
337        // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment
338        // and memory layout making `flatten_to_base` safe. This will create
339        // a vector of Goldilocks elements with value set to 0.
340        unsafe { flatten_to_base(vec![0u64; len]) }
341    }
342}
343
344/// Degree of the smallest permutation polynomial for Goldilocks.
345///
346/// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
347impl InjectiveMonomial<7> for Goldilocks {}
348
349impl PermutationMonomial<7> for Goldilocks {
350    /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}.
351    ///
352    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`.
353    fn injective_exp_root_n(&self) -> Self {
354        exp_10540996611094048183(*self)
355    }
356}
357
358impl RawDataSerializable for Goldilocks {
359    impl_raw_serializable_primefield64!();
360}
361
362impl Field for Goldilocks {
363    #[cfg(all(
364        target_arch = "x86_64",
365        target_feature = "avx2",
366        not(target_feature = "avx512f")
367    ))]
368    type Packing = crate::PackedGoldilocksAVX2;
369
370    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
371    type Packing = crate::PackedGoldilocksAVX512;
372
373    #[cfg(target_arch = "aarch64")]
374    type Packing = crate::PackedGoldilocksNeon;
375
376    #[cfg(not(any(
377        all(
378            target_arch = "x86_64",
379            target_feature = "avx2",
380            not(target_feature = "avx512f")
381        ),
382        all(target_arch = "x86_64", target_feature = "avx512f"),
383        target_arch = "aarch64",
384    )))]
385    type Packing = Self;
386
387    // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator()
388    const GENERATOR: Self = Self::new(7);
389
390    fn is_zero(&self) -> bool {
391        self.value == 0 || self.value == Self::ORDER_U64
392    }
393
394    fn try_inverse(&self) -> Option<Self> {
395        if self.is_zero() {
396            return None;
397        }
398
399        Some(gcd_inversion(*self))
400    }
401
402    #[inline]
403    fn order() -> BigUint {
404        P.into()
405    }
406}
407
408// We use macros to implement QuotientMap<Int> for all integer types except for u64 and i64.
409quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
410quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
411quotient_map_large_uint!(
412    Goldilocks,
413    u64,
414    Goldilocks::ORDER_U64,
415    "`[0, 2^64 - 2^32]`",
416    "`[0, 2^64 - 1]`",
417    [u128]
418);
419quotient_map_large_iint!(
420    Goldilocks,
421    i64,
422    "`[-(2^63 - 2^31), 2^63 - 2^31]`",
423    "`[1 + 2^32 - 2^64, 2^64 - 1]`",
424    [(i128, u128)]
425);
426
427impl QuotientMap<u64> for Goldilocks {
428    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
429    ///
430    /// No reduction is needed as the internal value is allowed
431    /// to be any u64.
432    #[inline]
433    fn from_int(int: u64) -> Self {
434        Self::new(int)
435    }
436
437    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
438    ///
439    /// Return `None` if the given integer is greater than `p = 2^64 - 2^32 + 1`.
440    #[inline]
441    fn from_canonical_checked(int: u64) -> Option<Self> {
442        (int < Self::ORDER_U64).then(|| Self::new(int))
443    }
444
445    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
446    ///
447    /// # Safety
448    /// In this case this function is actually always safe as the internal
449    /// value is allowed to be any u64.
450    #[inline(always)]
451    unsafe fn from_canonical_unchecked(int: u64) -> Self {
452        Self::new(int)
453    }
454}
455
456impl QuotientMap<i64> for Goldilocks {
457    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
458    ///
459    /// We simply need to deal with the sign.
460    #[inline]
461    fn from_int(int: i64) -> Self {
462        if int >= 0 {
463            Self::new(int as u64)
464        } else {
465            Self::new(Self::ORDER_U64.wrapping_add_signed(int))
466        }
467    }
468
469    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
470    ///
471    /// Returns none if the input does not lie in the range `(-(2^63 - 2^31), 2^63 - 2^31)`.
472    #[inline]
473    fn from_canonical_checked(int: i64) -> Option<Self> {
474        const POS_BOUND: i64 = (P >> 1) as i64;
475        const NEG_BOUND: i64 = -POS_BOUND;
476        match int {
477            0..=POS_BOUND => Some(Self::new(int as u64)),
478            NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))),
479            _ => None,
480        }
481    }
482
483    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
484    ///
485    /// # Safety
486    /// In this case this function is actually always safe as the internal
487    /// value is allowed to be any u64.
488    #[inline(always)]
489    unsafe fn from_canonical_unchecked(int: i64) -> Self {
490        Self::from_int(int)
491    }
492}
493
494impl PrimeField for Goldilocks {
495    fn as_canonical_biguint(&self) -> BigUint {
496        self.as_canonical_u64().into()
497    }
498}
499
500impl PrimeField64 for Goldilocks {
501    const ORDER_U64: u64 = P;
502
503    #[inline]
504    fn as_canonical_u64(&self) -> u64 {
505        let mut c = self.value;
506        // We only need one condition subtraction, since 2 * ORDER would not fit in a u64.
507        if c >= Self::ORDER_U64 {
508            c -= Self::ORDER_U64;
509        }
510        c
511    }
512}
513
514impl TwoAdicField for Goldilocks {
515    const TWO_ADICITY: usize = 32;
516
517    fn two_adic_generator(bits: usize) -> Self {
518        assert!(bits <= Self::TWO_ADICITY);
519        Self::TWO_ADIC_GENERATORS[bits]
520    }
521}
522
523/// A const version of the addition function.
524///
525/// Useful for constructing constants values in const contexts. Outside of
526/// const contexts, Add should be used instead.
527#[inline]
528const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
529    let (sum, over) = lhs.value.overflowing_add(rhs.value);
530    let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER);
531    if over {
532        sum += Goldilocks::NEG_ORDER;
533    }
534    Goldilocks::new(sum)
535}
536
537impl Add for Goldilocks {
538    type Output = Self;
539
540    #[inline]
541    fn add(self, rhs: Self) -> Self {
542        let (sum, over) = self.value.overflowing_add(rhs.value);
543        let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
544        if over {
545            // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not
546            // sufficient for double-overflow.
547            // This assume does two things:
548            //  1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip
549            //     this check.
550            //  2. Hints to the compiler how rare this double-overflow is (thus handled better with
551            //     a branch).
552            unsafe {
553                assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
554            }
555            branch_hint();
556            sum += Self::NEG_ORDER; // Cannot overflow.
557        }
558        Self::new(sum)
559    }
560}
561
562impl Sub for Goldilocks {
563    type Output = Self;
564
565    #[inline]
566    fn sub(self, rhs: Self) -> Self {
567        let (diff, under) = self.value.overflowing_sub(rhs.value);
568        let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
569        if under {
570            // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not
571            // sufficient for double-underflow.
572            // This assume does two things:
573            //  1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER,
574            //     then it can skip this check.
575            //  2. Hints to the compiler how rare this double-underflow is (thus handled better
576            //     with a branch).
577            unsafe {
578                assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
579            }
580            branch_hint();
581            diff -= Self::NEG_ORDER; // Cannot underflow.
582        }
583        Self::new(diff)
584    }
585}
586
587impl Neg for Goldilocks {
588    type Output = Self;
589
590    #[inline]
591    fn neg(self) -> Self::Output {
592        Self::new(Self::ORDER_U64 - self.as_canonical_u64())
593    }
594}
595
596impl Mul for Goldilocks {
597    type Output = Self;
598
599    #[inline]
600    fn mul(self, rhs: Self) -> Self {
601        reduce128(u128::from(self.value) * u128::from(rhs.value))
602    }
603}
604
605impl_add_assign!(Goldilocks);
606impl_sub_assign!(Goldilocks);
607impl_mul_methods!(Goldilocks);
608impl_div_methods!(Goldilocks, Goldilocks);
609
610impl Sum for Goldilocks {
611    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
612        // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2.
613
614        // This sum will not overflow so long as iter.len() < 2^64.
615        let sum = iter.map(|x| x.value as u128).sum::<u128>();
616        reduce128(sum)
617    }
618}
619
620/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
621/// field order and `2^64`.
622#[inline]
623pub(crate) fn reduce128(x: u128) -> Goldilocks {
624    let (x_lo, x_hi) = split(x); // This is a no-op
625    let x_hi_hi = x_hi >> 32;
626    let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
627
628    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
629    if borrow {
630        branch_hint(); // A borrow is exceedingly rare. It is faster to branch.
631        t0 -= Goldilocks::NEG_ORDER; // Cannot underflow.
632    }
633    let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
634    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
635    Goldilocks::new(t2)
636}
637
638#[inline]
639#[allow(clippy::cast_possible_truncation)]
640const fn split(x: u128) -> (u64, u64) {
641    (x as u64, (x >> 64) as u64)
642}
643
644/// Fast addition modulo ORDER for x86-64.
645/// This function is marked unsafe for the following reasons:
646///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
647///   - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
648///     the registers, so its use is not recommended when either input will be used again.
649#[inline(always)]
650#[cfg(target_arch = "x86_64")]
651unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
652    unsafe {
653        let res_wrapped: u64;
654        let adjustment: u64;
655        core::arch::asm!(
656            "add {0}, {1}",
657            // Trick. The carry flag is set iff the addition overflowed.
658            // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
659            // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
660            // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
661            // on overflow; this happens be NEG_ORDER.
662            // Note that the CPU does not realize that the result of sbb x, x does not actually depend
663            // on x. We must write the result to a register that we know to be ready. We have a
664            // dependency on {1} anyway, so let's use it.
665            "sbb {1:e}, {1:e}",
666            inlateout(reg) x => res_wrapped,
667            inlateout(reg) y => adjustment,
668            options(pure, nomem, nostack),
669        );
670        assume(x != 0 || (res_wrapped == y && adjustment == 0));
671        assume(y != 0 || (res_wrapped == x && adjustment == 0));
672        // Add NEG_ORDER == subtract ORDER.
673        // Cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
674        res_wrapped + adjustment
675    }
676}
677
678#[inline(always)]
679#[cfg(not(target_arch = "x86_64"))]
680unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
681    let (res_wrapped, carry) = x.overflowing_add(y);
682    // Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
683    res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry)
684}
685
686/// Compute the inverse of a Goldilocks element `a` using the binary GCD algorithm.
687///
688/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf.
689/// The key idea is to compute update factors which are incorrect by a known power of 2 which
690/// can be corrected at the end. These update factors can then be used to construct the inverse
691/// via a simple linear combination.
692///
693/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic.
694fn gcd_inversion(input: Goldilocks) -> Goldilocks {
695    // Initialise our values to the value we want to invert and the prime.
696    let (mut a, mut b) = (input.value, P);
697
698    // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`.
699    // This means we will need `126` iterations of the inner loop ensure `len(a) + len(b) ≤ 2`.
700    // We split the iterations into 2 rounds of length 63.
701    const ROUND_SIZE: usize = 63;
702
703    // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted
704    // version which doesn't do any computations involving g. But either the compiler works this out
705    // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks.
706    let (f00, _, f10, _) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
707    let (_, _, f11, g11) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
708
709    // The update factors are i64's except we need to interpret -2^63 as 2^63.
710    // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`.
711    let u = from_unusual_int(f00);
712    let v = from_unusual_int(f10);
713    let u_fac11 = from_unusual_int(f11);
714    let v_fac11 = from_unusual_int(g11);
715
716    // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}.
717    // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66.
718    (u * u_fac11 + v * v_fac11).mul_2exp_u64(66)
719}
720
721/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63.
722const fn from_unusual_int(int: i64) -> Goldilocks {
723    if (int >= 0) || (int == i64::MIN) {
724        Goldilocks::new(int as u64)
725    } else {
726        Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int))
727    }
728}
729
730#[cfg(test)]
731mod tests {
732    use p3_field::extension::BinomialExtensionField;
733    use p3_field_testing::{
734        test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field,
735    };
736
737    use super::*;
738
739    type F = Goldilocks;
740    type EF = BinomialExtensionField<F, 5>;
741
742    #[test]
743    fn test_goldilocks() {
744        let f = F::new(100);
745        assert_eq!(f.as_canonical_u64(), 100);
746
747        // Over the Goldilocks field, the following set of equations hold
748        // p               = 0
749        // 2^64 - 2^32 + 1 = 0
750        // 2^64            = 2^32 - 1
751        let f = F::new(u64::MAX);
752        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
753
754        let f = F::from_u64(u64::MAX);
755        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
756
757        // Generator check
758        let expected_multiplicative_group_generator = F::new(7);
759        assert_eq!(F::GENERATOR, expected_multiplicative_group_generator);
760        assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64);
761
762        // Check on `reduce_u128`
763        let x = u128::MAX;
764        let y = reduce128(x);
765        // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1
766        // 2^128 - 1 = (2^64 - 1) * (2^64 + 1)
767        //           = (2^32 - 1 - 1) * (2^32 - 1 + 1)
768        //           = (2^32 - 2) * (2^32)
769        //           = 2^64 - 2 * 2^32
770        //           = 2^64 - 2^33
771        //           = 2^32 - 1 - 2^33
772        //           = - 2^32 - 1
773        let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE;
774        assert_eq!(y, expected_result);
775
776        let f = F::new(100);
777        assert_eq!(f.injective_exp_n().injective_exp_root_n(), f);
778        assert_eq!(y.injective_exp_n().injective_exp_root_n(), y);
779        assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO);
780    }
781
782    // Goldilocks has a redundant representation for both 0 and 1.
783    const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)];
784    const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)];
785
786    // Get the prime factorization of the order of the multiplicative group.
787    // i.e. the prime factorization of P - 1.
788    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] {
789        [
790            (BigUint::from(2u8), 32),
791            (BigUint::from(3u8), 1),
792            (BigUint::from(5u8), 1),
793            (BigUint::from(17u8), 1),
794            (BigUint::from(257u16), 1),
795            (BigUint::from(65537u32), 1),
796        ]
797    }
798
799    test_field!(
800        crate::Goldilocks,
801        &super::ZEROS,
802        &super::ONES,
803        &super::multiplicative_group_prime_factorization()
804    );
805    test_prime_field!(crate::Goldilocks);
806    test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES);
807    test_two_adic_field!(crate::Goldilocks);
808
809    test_field_dft!(
810        radix2dit,
811        crate::Goldilocks,
812        super::EF,
813        p3_dft::Radix2Dit<_>
814    );
815    test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers);
816    test_field_dft!(
817        parallel,
818        crate::Goldilocks,
819        super::EF,
820        p3_dft::Radix2DitParallel<crate::Goldilocks>
821    );
822}