Skip to main content

p3_goldilocks/
goldilocks.rs

1use alloc::vec;
2use alloc::vec::Vec;
3use core::fmt::{Debug, Display, Formatter};
4use core::hash::{Hash, Hasher};
5use core::iter::{Product, Sum};
6use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
7use core::{array, fmt};
8
9use num_bigint::BigUint;
10use p3_challenger::UniformSamplingField;
11use p3_field::exponentiation::exp_10540996611094048183;
12use p3_field::integers::QuotientMap;
13use p3_field::op_assign_macros::{
14    impl_add_assign, impl_div_methods, impl_mul_methods, impl_sub_assign,
15};
16use p3_field::{
17    Field, InjectiveMonomial, Packable, PermutationMonomial, PrimeCharacteristicRing, PrimeField,
18    PrimeField64, RawDataSerializable, TwoAdicField, halve_u64, impl_raw_serializable_primefield64,
19    quotient_map_large_iint, quotient_map_large_uint, quotient_map_small_int,
20};
21use p3_util::{assume, branch_hint, flatten_to_base, gcd_inner};
22use rand::Rng;
23use rand::distr::{Distribution, StandardUniform};
24use serde::{Deserialize, Serialize};
25
26/// The Goldilocks prime
27pub(crate) const P: u64 = 0xFFFF_FFFF_0000_0001;
28
29/// The prime field known as Goldilocks, defined as `F_p` where `p = 2^64 - 2^32 + 1`.
30///
31/// Note that the safety of deriving `Serialize` and `Deserialize` relies on the fact that the internal value can be any u64.
32#[derive(Copy, Clone, Default, Serialize, Deserialize)]
33#[repr(transparent)] // Important for reasoning about memory layout
34#[must_use]
35pub struct Goldilocks {
36    /// Not necessarily canonical.
37    pub(crate) value: u64,
38}
39
40impl Goldilocks {
41    /// Create a new field element from any `u64`.
42    ///
43    /// Any `u64` value is accepted. No reduction is performed since
44    /// Goldilocks uses a non-canonical internal representation.
45    #[inline]
46    pub const fn new(value: u64) -> Self {
47        Self { value }
48    }
49
50    /// Convert a `[u64; N]` array to an array of field elements.
51    ///
52    /// Const version of `input.map(Goldilocks::new)`.
53    #[inline]
54    pub const fn new_array<const N: usize>(input: [u64; N]) -> [Self; N] {
55        let mut output = [Self::ZERO; N];
56        let mut i = 0;
57        while i < N {
58            output[i].value = input[i];
59            i += 1;
60        }
61        output
62    }
63
64    /// Convert a `[[u64; N]; M]` array to a 2D array of field elements.
65    ///
66    /// Const version of `input.map(Goldilocks::new_array)`.
67    #[inline]
68    pub const fn new_2d_array<const N: usize, const M: usize>(
69        input: [[u64; N]; M],
70    ) -> [[Self; N]; M] {
71        let mut output = [[Self::ZERO; N]; M];
72        let mut i = 0;
73        while i < M {
74            output[i] = Self::new_array(input[i]);
75            i += 1;
76        }
77        output
78    }
79
80    /// Two's complement of `ORDER`, i.e. `2^64 - ORDER = 2^32 - 1`.
81    const NEG_ORDER: u64 = Self::ORDER_U64.wrapping_neg();
82
83    /// A list of generators for the two-adic subgroups of the goldilocks field.
84    ///
85    /// These satisfy the properties that `TWO_ADIC_GENERATORS[0] = 1` and `TWO_ADIC_GENERATORS[i+1]^2 = TWO_ADIC_GENERATORS[i]`.
86    pub const TWO_ADIC_GENERATORS: [Self; 33] = Self::new_array([
87        0x0000000000000001,
88        0xffffffff00000000,
89        0x0001000000000000,
90        0xfffffffeff000001,
91        0xefffffff00000001,
92        0x00003fffffffc000,
93        0x0000008000000000,
94        0xf80007ff08000001,
95        0xbf79143ce60ca966,
96        0x1905d02a5c411f4e,
97        0x9d8f2ad78bfed972,
98        0x0653b4801da1c8cf,
99        0xf2c35199959dfcb6,
100        0x1544ef2335d17997,
101        0xe0ee099310bba1e2,
102        0xf6b2cffe2306baac,
103        0x54df9630bf79450e,
104        0xabd0a6e8aa3d8a0e,
105        0x81281a7b05f9beac,
106        0xfbd41c6b8caa3302,
107        0x30ba2ecd5e93e76d,
108        0xf502aef532322654,
109        0x4b2a18ade67246b5,
110        0xea9d5a1336fbc98b,
111        0x86cdcc31c307e171,
112        0x4bbaf5976ecfefd8,
113        0xed41d05b78d6e286,
114        0x10d78dd8915a171d,
115        0x59049500004a4485,
116        0xdfa8c93ba46d2666,
117        0x7e9bd009b86a0845,
118        0x400a7f755588e659,
119        0x185629dcda58878c,
120    ]);
121
122    /// A list of powers of two from 0 to 95.
123    ///
124    /// Note that 2^{96} = -1 mod P so all powers of two can be simply
125    /// derived from this list.
126    const POWERS_OF_TWO: [Self; 96] = {
127        let mut powers_of_two = [Self::ONE; 96];
128
129        let mut i = 1;
130        while i < 64 {
131            powers_of_two[i] = Self::new(1 << i);
132            i += 1;
133        }
134        let mut var = Self::new(1 << 63);
135        while i < 96 {
136            var = const_add(var, var);
137            powers_of_two[i] = var;
138            i += 1;
139        }
140        powers_of_two
141    };
142}
143
144impl PartialEq for Goldilocks {
145    fn eq(&self, other: &Self) -> bool {
146        self.as_canonical_u64() == other.as_canonical_u64()
147    }
148}
149
150impl Eq for Goldilocks {}
151
152impl Packable for Goldilocks {}
153
154impl Hash for Goldilocks {
155    fn hash<H: Hasher>(&self, state: &mut H) {
156        state.write_u64(self.as_canonical_u64());
157    }
158}
159
160impl Ord for Goldilocks {
161    fn cmp(&self, other: &Self) -> core::cmp::Ordering {
162        self.as_canonical_u64().cmp(&other.as_canonical_u64())
163    }
164}
165
166impl PartialOrd for Goldilocks {
167    fn partial_cmp(&self, other: &Self) -> Option<core::cmp::Ordering> {
168        Some(self.cmp(other))
169    }
170}
171
172impl Display for Goldilocks {
173    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
174        Display::fmt(&self.as_canonical_u64(), f)
175    }
176}
177
178impl Debug for Goldilocks {
179    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
180        Debug::fmt(&self.as_canonical_u64(), f)
181    }
182}
183
184impl Distribution<Goldilocks> for StandardUniform {
185    fn sample<R: Rng + ?Sized>(&self, rng: &mut R) -> Goldilocks {
186        loop {
187            let next_u64 = rng.next_u64();
188            let is_canonical = next_u64 < Goldilocks::ORDER_U64;
189            if is_canonical {
190                return Goldilocks::new(next_u64);
191            }
192        }
193    }
194}
195
196impl UniformSamplingField for Goldilocks {
197    const MAX_SINGLE_SAMPLE_BITS: usize = 24;
198    const SAMPLING_BITS_M: [u64; 64] = {
199        let prime: u64 = P;
200        let mut a = [0u64; 64];
201        let mut k = 0;
202        while k < 64 {
203            if k == 0 {
204                a[k] = prime; // This value is irrelevant in practice. `bits = 0` returns 0 always.
205            } else {
206                // Create a mask to zero out the last k bits
207                let mask = !((1u64 << k) - 1);
208                a[k] = prime & mask;
209            }
210            k += 1;
211        }
212        a
213    };
214}
215
216impl PrimeCharacteristicRing for Goldilocks {
217    type PrimeSubfield = Self;
218
219    const ZERO: Self = Self::new(0);
220    const ONE: Self = Self::new(1);
221    const TWO: Self = Self::new(2);
222    const NEG_ONE: Self = Self::new(Self::ORDER_U64 - 1);
223
224    #[inline]
225    fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
226        f
227    }
228
229    #[inline]
230    fn from_bool(b: bool) -> Self {
231        Self::new(b.into())
232    }
233
234    #[inline]
235    fn halve(&self) -> Self {
236        Self::new(halve_u64::<P>(self.value))
237    }
238
239    #[inline]
240    fn mul_2exp_u64(&self, exp: u64) -> Self {
241        // In the Goldilocks field, 2^96 = -1 mod P and 2^192 = 1 mod P.
242        if exp < 96 {
243            *self * Self::POWERS_OF_TWO[exp as usize]
244        } else if exp < 192 {
245            -*self * Self::POWERS_OF_TWO[(exp - 96) as usize]
246        } else {
247            self.mul_2exp_u64(exp % 192)
248        }
249    }
250
251    #[inline]
252    fn div_2exp_u64(&self, mut exp: u64) -> Self {
253        // In the goldilocks field, 2^192 = 1 mod P.
254        // Thus 2^{-n} = 2^{192 - n} mod P.
255        exp %= 192;
256        self.mul_2exp_u64(192 - exp)
257    }
258
259    #[inline]
260    fn sum_array<const N: usize>(input: &[Self]) -> Self {
261        assert_eq!(N, input.len());
262        // Benchmarking shows that for N <= 3 it's faster to sum the elements directly
263        // but for N > 3 it's faster to use the .sum() methods which passes through u128's
264        // allowing for delayed reductions.
265        match N {
266            0 => Self::ZERO,
267            1 => input[0],
268            2 => input[0] + input[1],
269            3 => input[0] + input[1] + input[2],
270            _ => input.iter().copied().sum(),
271        }
272    }
273
274    #[inline]
275    fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
276        // The constant OFFSET has 2 important properties:
277        // 1. It is a multiple of P.
278        // 2. It is greater than the maximum possible value of the sum of the products of two u64s.
279        const OFFSET: u128 = ((P as u128) << 64) - (P as u128) + ((P as u128) << 32);
280        assert!((N as u32) <= (1 << 31));
281        match N {
282            0 => Self::ZERO,
283            1 => lhs[0] * rhs[0],
284            2 => {
285                // We unroll the N = 2 case as it is slightly faster and this is an important case
286                // as a major use is in extension field arithmetic and Goldilocks has a degree 2 extension.
287                let long_prod_0 = (lhs[0].value as u128) * (rhs[0].value as u128);
288                let long_prod_1 = (lhs[1].value as u128) * (rhs[1].value as u128);
289
290                // We know that long_prod_0, long_prod_1 < OFFSET.
291                // Thus if long_prod_0 + long_prod_1 overflows, we can just subtract OFFSET.
292                let (sum, over) = long_prod_0.overflowing_add(long_prod_1);
293                // Compiler really likes defining sum_corr here instead of in the if/else.
294                let sum_corr = sum.wrapping_sub(OFFSET);
295                if over {
296                    reduce128(sum_corr)
297                } else {
298                    reduce128(sum)
299                }
300            }
301            _ => {
302                let (lo_plus_hi, hi) = lhs
303                    .iter()
304                    .zip(rhs)
305                    .map(|(x, y)| (x.value as u128) * (y.value as u128))
306                    .fold((0_u128, 0_u64), |(acc_lo, acc_hi), val| {
307                        // Split val into (hi, lo) where hi is the upper 32 bits and lo is the lower 96 bits.
308                        let val_hi = (val >> 96) as u64;
309                        // acc_hi accumulates hi, acc_lo accumulates lo + 2^{96}hi.
310                        // As N <= 2^32, acc_hi cannot overflow.
311                        unsafe { (acc_lo.wrapping_add(val), acc_hi.unchecked_add(val_hi)) }
312                    });
313                // First, remove the hi part from lo_plus_hi.
314                let lo = lo_plus_hi.wrapping_sub((hi as u128) << 96);
315                // As 2^{96} = -1 mod P, we simply need to reduce lo - hi.
316                // As N <= 2^31, lo < 2^127 and hi < 2^63 < P. Hence the equation below will not over or underflow.
317                let sum = unsafe { lo.unchecked_add(P.unchecked_sub(hi) as u128) };
318                reduce128(sum)
319            }
320        }
321    }
322
323    #[inline]
324    fn zero_vec(len: usize) -> Vec<Self> {
325        // SAFETY:
326        // Due to `#[repr(transparent)]`, Goldilocks and u64 have the same size, alignment
327        // and memory layout making `flatten_to_base` safe. This this will create
328        // a vector Goldilocks elements with value set to 0.
329        unsafe { flatten_to_base(vec![0u64; len]) }
330    }
331}
332
333/// Degree of the smallest permutation polynomial for Goldilocks.
334///
335/// As p - 1 = 2^32 * 3 * 5 * 17 * ... the smallest choice for a degree D satisfying gcd(p - 1, D) = 1 is 7.
336impl InjectiveMonomial<7> for Goldilocks {}
337
338impl PermutationMonomial<7> for Goldilocks {
339    /// In the field `Goldilocks`, `a^{1/7}` is equal to a^{10540996611094048183}.
340    ///
341    /// This follows from the calculation `7*10540996611094048183 = 4*(2^64 - 2**32) + 1 = 1 mod (p - 1)`.
342    fn injective_exp_root_n(&self) -> Self {
343        exp_10540996611094048183(*self)
344    }
345}
346
347impl RawDataSerializable for Goldilocks {
348    impl_raw_serializable_primefield64!();
349}
350
351impl Field for Goldilocks {
352    #[cfg(all(
353        target_arch = "x86_64",
354        target_feature = "avx2",
355        not(target_feature = "avx512f")
356    ))]
357    type Packing = crate::PackedGoldilocksAVX2;
358
359    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f"))]
360    type Packing = crate::PackedGoldilocksAVX512;
361
362    #[cfg(target_arch = "aarch64")]
363    type Packing = crate::PackedGoldilocksNeon;
364
365    #[cfg(not(any(
366        all(
367            target_arch = "x86_64",
368            target_feature = "avx2",
369            not(target_feature = "avx512f")
370        ),
371        all(target_arch = "x86_64", target_feature = "avx512f"),
372        target_arch = "aarch64",
373    )))]
374    type Packing = Self;
375
376    // Sage: GF(2^64 - 2^32 + 1).multiplicative_generator()
377    const GENERATOR: Self = Self::new(7);
378
379    fn is_zero(&self) -> bool {
380        self.value == 0 || self.value == Self::ORDER_U64
381    }
382
383    fn try_inverse(&self) -> Option<Self> {
384        if self.is_zero() {
385            return None;
386        }
387
388        Some(gcd_inversion(*self))
389    }
390
391    #[inline]
392    fn order() -> BigUint {
393        P.into()
394    }
395}
396
397// We use macros to implement QuotientMap<Int> for all integer types except for u64 and i64.
398quotient_map_small_int!(Goldilocks, u64, [u8, u16, u32]);
399quotient_map_small_int!(Goldilocks, i64, [i8, i16, i32]);
400quotient_map_large_uint!(
401    Goldilocks,
402    u64,
403    Goldilocks::ORDER_U64,
404    "`[0, 2^64 - 2^32]`",
405    "`[0, 2^64 - 1]`",
406    [u128]
407);
408quotient_map_large_iint!(
409    Goldilocks,
410    i64,
411    "`[-(2^63 - 2^31), 2^63 - 2^31]`",
412    "`[1 + 2^32 - 2^64, 2^64 - 1]`",
413    [(i128, u128)]
414);
415
416impl QuotientMap<u64> for Goldilocks {
417    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
418    ///
419    /// No reduction is needed as the internal value is allowed
420    /// to be any u64.
421    #[inline]
422    fn from_int(int: u64) -> Self {
423        Self::new(int)
424    }
425
426    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
427    ///
428    /// Return `None` if the given integer is greater than `p = 2^64 - 2^32 + 1`.
429    #[inline]
430    fn from_canonical_checked(int: u64) -> Option<Self> {
431        (int < Self::ORDER_U64).then(|| Self::new(int))
432    }
433
434    /// Convert a given `u64` integer into an element of the `Goldilocks` field.
435    ///
436    /// # Safety
437    /// In this case this function is actually always safe as the internal
438    /// value is allowed to be any u64.
439    #[inline(always)]
440    unsafe fn from_canonical_unchecked(int: u64) -> Self {
441        Self::new(int)
442    }
443}
444
445impl QuotientMap<i64> for Goldilocks {
446    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
447    ///
448    /// We simply need to deal with the sign.
449    #[inline]
450    fn from_int(int: i64) -> Self {
451        if int >= 0 {
452            Self::new(int as u64)
453        } else {
454            Self::new(Self::ORDER_U64.wrapping_add_signed(int))
455        }
456    }
457
458    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
459    ///
460    /// Returns none if the input does not lie in the range `(-(2^63 - 2^31), 2^63 - 2^31)`.
461    #[inline]
462    fn from_canonical_checked(int: i64) -> Option<Self> {
463        const POS_BOUND: i64 = (P >> 1) as i64;
464        const NEG_BOUND: i64 = -POS_BOUND;
465        match int {
466            0..=POS_BOUND => Some(Self::new(int as u64)),
467            NEG_BOUND..0 => Some(Self::new(Self::ORDER_U64.wrapping_add_signed(int))),
468            _ => None,
469        }
470    }
471
472    /// Convert a given `i64` integer into an element of the `Goldilocks` field.
473    ///
474    /// # Safety
475    /// In this case this function is actually always safe as the internal
476    /// value is allowed to be any u64.
477    #[inline(always)]
478    unsafe fn from_canonical_unchecked(int: i64) -> Self {
479        Self::from_int(int)
480    }
481}
482
483impl PrimeField for Goldilocks {
484    fn as_canonical_biguint(&self) -> BigUint {
485        self.as_canonical_u64().into()
486    }
487}
488
489impl PrimeField64 for Goldilocks {
490    const ORDER_U64: u64 = P;
491
492    #[inline]
493    fn as_canonical_u64(&self) -> u64 {
494        let mut c = self.value;
495        // We only need one condition subtraction, since 2 * ORDER would not fit in a u64.
496        if c >= Self::ORDER_U64 {
497            c -= Self::ORDER_U64;
498        }
499        c
500    }
501}
502
503impl TwoAdicField for Goldilocks {
504    const TWO_ADICITY: usize = 32;
505
506    fn two_adic_generator(bits: usize) -> Self {
507        assert!(bits <= Self::TWO_ADICITY);
508        Self::TWO_ADIC_GENERATORS[bits]
509    }
510}
511
512/// A const version of the addition function.
513///
514/// Useful for constructing constants values in const contexts. Outside of
515/// const contexts, Add should be used instead.
516#[inline]
517const fn const_add(lhs: Goldilocks, rhs: Goldilocks) -> Goldilocks {
518    let (sum, over) = lhs.value.overflowing_add(rhs.value);
519    let (mut sum, over) = sum.overflowing_add((over as u64) * Goldilocks::NEG_ORDER);
520    if over {
521        sum += Goldilocks::NEG_ORDER;
522    }
523    Goldilocks::new(sum)
524}
525
526impl Add for Goldilocks {
527    type Output = Self;
528
529    #[inline]
530    fn add(self, rhs: Self) -> Self {
531        let (sum, over) = self.value.overflowing_add(rhs.value);
532        let (mut sum, over) = sum.overflowing_add(u64::from(over) * Self::NEG_ORDER);
533        if over {
534            // NB: self.value > Self::ORDER && rhs.value > Self::ORDER is necessary but not
535            // sufficient for double-overflow.
536            // This assume does two things:
537            //  1. If compiler knows that either self.value or rhs.value <= ORDER, then it can skip
538            //     this check.
539            //  2. Hints to the compiler how rare this double-overflow is (thus handled better with
540            //     a branch).
541            unsafe {
542                assume(self.value > Self::ORDER_U64 && rhs.value > Self::ORDER_U64);
543            }
544            branch_hint();
545            sum += Self::NEG_ORDER; // Cannot overflow.
546        }
547        Self::new(sum)
548    }
549}
550
551impl Sub for Goldilocks {
552    type Output = Self;
553
554    #[inline]
555    fn sub(self, rhs: Self) -> Self {
556        let (diff, under) = self.value.overflowing_sub(rhs.value);
557        let (mut diff, under) = diff.overflowing_sub(u64::from(under) * Self::NEG_ORDER);
558        if under {
559            // NB: self.value < NEG_ORDER - 1 && rhs.value > ORDER is necessary but not
560            // sufficient for double-underflow.
561            // This assume does two things:
562            //  1. If compiler knows that either self.value >= NEG_ORDER - 1 or rhs.value <= ORDER,
563            //     then it can skip this check.
564            //  2. Hints to the compiler how rare this double-underflow is (thus handled better
565            //     with a branch).
566            unsafe {
567                assume(self.value < Self::NEG_ORDER - 1 && rhs.value > Self::ORDER_U64);
568            }
569            branch_hint();
570            diff -= Self::NEG_ORDER; // Cannot underflow.
571        }
572        Self::new(diff)
573    }
574}
575
576impl Neg for Goldilocks {
577    type Output = Self;
578
579    #[inline]
580    fn neg(self) -> Self::Output {
581        Self::new(Self::ORDER_U64 - self.as_canonical_u64())
582    }
583}
584
585impl Mul for Goldilocks {
586    type Output = Self;
587
588    #[inline]
589    fn mul(self, rhs: Self) -> Self {
590        reduce128(u128::from(self.value) * u128::from(rhs.value))
591    }
592}
593
594impl_add_assign!(Goldilocks);
595impl_sub_assign!(Goldilocks);
596impl_mul_methods!(Goldilocks);
597impl_div_methods!(Goldilocks, Goldilocks);
598
599impl Sum for Goldilocks {
600    fn sum<I: Iterator<Item = Self>>(iter: I) -> Self {
601        // This is faster than iter.reduce(|x, y| x + y).unwrap_or(Self::ZERO) for iterators of length > 2.
602
603        // This sum will not overflow so long as iter.len() < 2^64.
604        let sum = iter.map(|x| x.value as u128).sum::<u128>();
605        reduce128(sum)
606    }
607}
608
609/// Reduces to a 64-bit value. The result might not be in canonical form; it could be in between the
610/// field order and `2^64`.
611#[inline]
612pub(crate) fn reduce128(x: u128) -> Goldilocks {
613    let (x_lo, x_hi) = split(x); // This is a no-op
614    let x_hi_hi = x_hi >> 32;
615    let x_hi_lo = x_hi & Goldilocks::NEG_ORDER;
616
617    let (mut t0, borrow) = x_lo.overflowing_sub(x_hi_hi);
618    if borrow {
619        branch_hint(); // A borrow is exceedingly rare. It is faster to branch.
620        t0 -= Goldilocks::NEG_ORDER; // Cannot underflow.
621    }
622    let t1 = x_hi_lo * Goldilocks::NEG_ORDER;
623    let t2 = unsafe { add_no_canonicalize_trashing_input(t0, t1) };
624    Goldilocks::new(t2)
625}
626
627#[inline]
628#[allow(clippy::cast_possible_truncation)]
629const fn split(x: u128) -> (u64, u64) {
630    (x as u64, (x >> 64) as u64)
631}
632
633/// Fast addition modulo ORDER for x86-64.
634/// This function is marked unsafe for the following reasons:
635///   - It is only correct if x + y < 2**64 + ORDER = 0x1ffffffff00000001.
636///   - It is only faster in some circumstances. In particular, on x86 it overwrites both inputs in
637///     the registers, so its use is not recommended when either input will be used again.
638#[inline(always)]
639#[cfg(target_arch = "x86_64")]
640unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
641    unsafe {
642        let res_wrapped: u64;
643        let adjustment: u64;
644        core::arch::asm!(
645            "add {0}, {1}",
646            // Trick. The carry flag is set iff the addition overflowed.
647            // sbb x, y does x := x - y - CF. In our case, x and y are both {1:e}, so it simply does
648            // {1:e} := 0xffffffff on overflow and {1:e} := 0 otherwise. {1:e} is the low 32 bits of
649            // {1}; the high 32-bits are zeroed on write. In the end, we end up with 0xffffffff in {1}
650            // on overflow; this happens be NEG_ORDER.
651            // Note that the CPU does not realize that the result of sbb x, x does not actually depend
652            // on x. We must write the result to a register that we know to be ready. We have a
653            // dependency on {1} anyway, so let's use it.
654            "sbb {1:e}, {1:e}",
655            inlateout(reg) x => res_wrapped,
656            inlateout(reg) y => adjustment,
657            options(pure, nomem, nostack),
658        );
659        assume(x != 0 || (res_wrapped == y && adjustment == 0));
660        assume(y != 0 || (res_wrapped == x && adjustment == 0));
661        // Add NEG_ORDER == subtract ORDER.
662        // Cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
663        res_wrapped + adjustment
664    }
665}
666
667#[inline(always)]
668#[cfg(not(target_arch = "x86_64"))]
669unsafe fn add_no_canonicalize_trashing_input(x: u64, y: u64) -> u64 {
670    let (res_wrapped, carry) = x.overflowing_add(y);
671    // Below cannot overflow unless the assumption if x + y < 2**64 + ORDER is incorrect.
672    res_wrapped + Goldilocks::NEG_ORDER * u64::from(carry)
673}
674
675/// Compute the inverse of a Goldilocks element `a` using the binary GCD algorithm.
676///
677/// Instead of applying the standard algorithm this uses a variant inspired by https://eprint.iacr.org/2020/972.pdf.
678/// The key idea is to compute update factors which are incorrect by a known power of 2 which
679/// can be corrected at the end. These update factors can then be used to construct the inverse
680/// via a simple linear combination.
681///
682/// This is much faster than the standard algorithm as we avoid most of the (more expensive) field arithmetic.
683fn gcd_inversion(input: Goldilocks) -> Goldilocks {
684    // Initialise our values to the value we want to invert and the prime.
685    let (mut a, mut b) = (input.value, P);
686
687    // As the goldilocks prime is 64 bit, initially `len(a) + len(b) ≤ 2 * 64 = 128`.
688    // This means we will need `126` iterations of the inner loop ensure `len(a) + len(b) ≤ 2`.
689    // We split the iterations into 2 rounds of length 63.
690    const ROUND_SIZE: usize = 63;
691
692    // In theory we could make this slightly faster by replacing the first `gcd_inner` by a copy-pasted
693    // version which doesn't do any computations involving g. But either the compiler works this out
694    // for itself or the speed up is negligible as I couldn't notice any difference in benchmarks.
695    let (f00, _, f10, _) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
696    let (_, _, f11, g11) = gcd_inner::<ROUND_SIZE>(&mut a, &mut b);
697
698    // The update factors are i64's except we need to interpret -2^63 as 2^63.
699    // This is because the outputs of `gcd_inner` are always in the range `(-2^ROUND_SIZE, 2^ROUND_SIZE]`.
700    let u = from_unusual_int(f00);
701    let v = from_unusual_int(f10);
702    let u_fac11 = from_unusual_int(f11);
703    let v_fac11 = from_unusual_int(g11);
704
705    // Each iteration introduced a factor of 2 and so we need to divide by 2^{126}.
706    // But 2^{192} = 1 mod P, so we can instead multiply by 2^{66} as 192 - 126 = 66.
707    (u * u_fac11 + v * v_fac11).mul_2exp_u64(66)
708}
709
710/// Convert from an i64 to a Goldilocks element but interpret -2^63 as 2^63.
711const fn from_unusual_int(int: i64) -> Goldilocks {
712    if (int >= 0) || (int == i64::MIN) {
713        Goldilocks::new(int as u64)
714    } else {
715        Goldilocks::new(Goldilocks::ORDER_U64.wrapping_add_signed(int))
716    }
717}
718
719#[cfg(test)]
720mod tests {
721    use p3_field::extension::BinomialExtensionField;
722    use p3_field_testing::{
723        test_field, test_field_dft, test_prime_field, test_prime_field_64, test_two_adic_field,
724    };
725
726    use super::*;
727
728    type F = Goldilocks;
729    type EF = BinomialExtensionField<F, 5>;
730
731    #[test]
732    fn test_goldilocks() {
733        let f = F::new(100);
734        assert_eq!(f.as_canonical_u64(), 100);
735
736        // Over the Goldilocks field, the following set of equations hold
737        // p               = 0
738        // 2^64 - 2^32 + 1 = 0
739        // 2^64            = 2^32 - 1
740        let f = F::new(u64::MAX);
741        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
742
743        let f = F::from_u64(u64::MAX);
744        assert_eq!(f.as_canonical_u64(), u32::MAX as u64 - 1);
745
746        // Generator check
747        let expected_multiplicative_group_generator = F::new(7);
748        assert_eq!(F::GENERATOR, expected_multiplicative_group_generator);
749        assert_eq!(F::GENERATOR.as_canonical_u64(), 7_u64);
750
751        // Check on `reduce_u128`
752        let x = u128::MAX;
753        let y = reduce128(x);
754        // The following equality sequence holds, modulo p = 2^64 - 2^32 + 1
755        // 2^128 - 1 = (2^64 - 1) * (2^64 + 1)
756        //           = (2^32 - 1 - 1) * (2^32 - 1 + 1)
757        //           = (2^32 - 2) * (2^32)
758        //           = 2^64 - 2 * 2^32
759        //           = 2^64 - 2^33
760        //           = 2^32 - 1 - 2^33
761        //           = - 2^32 - 1
762        let expected_result = -F::TWO.exp_power_of_2(5) - F::ONE;
763        assert_eq!(y, expected_result);
764
765        let f = F::new(100);
766        assert_eq!(f.injective_exp_n().injective_exp_root_n(), f);
767        assert_eq!(y.injective_exp_n().injective_exp_root_n(), y);
768        assert_eq!(F::TWO.injective_exp_n().injective_exp_root_n(), F::TWO);
769    }
770
771    // Goldilocks has a redundant representation for both 0 and 1.
772    const ZEROS: [Goldilocks; 2] = [Goldilocks::ZERO, Goldilocks::new(P)];
773    const ONES: [Goldilocks; 2] = [Goldilocks::ONE, Goldilocks::new(P + 1)];
774
775    // Get the prime factorization of the order of the multiplicative group.
776    // i.e. the prime factorization of P - 1.
777    fn multiplicative_group_prime_factorization() -> [(BigUint, u32); 6] {
778        [
779            (BigUint::from(2u8), 32),
780            (BigUint::from(3u8), 1),
781            (BigUint::from(5u8), 1),
782            (BigUint::from(17u8), 1),
783            (BigUint::from(257u16), 1),
784            (BigUint::from(65537u32), 1),
785        ]
786    }
787
788    test_field!(
789        crate::Goldilocks,
790        &super::ZEROS,
791        &super::ONES,
792        &super::multiplicative_group_prime_factorization()
793    );
794    test_prime_field!(crate::Goldilocks);
795    test_prime_field_64!(crate::Goldilocks, &super::ZEROS, &super::ONES);
796    test_two_adic_field!(crate::Goldilocks);
797
798    test_field_dft!(
799        radix2dit,
800        crate::Goldilocks,
801        super::EF,
802        p3_dft::Radix2Dit<_>
803    );
804    test_field_dft!(bowers, crate::Goldilocks, super::EF, p3_dft::Radix2Bowers);
805    test_field_dft!(
806        parallel,
807        crate::Goldilocks,
808        super::EF,
809        p3_dft::Radix2DitParallel<crate::Goldilocks>
810    );
811}