miden-gpu 0.6.0

#ifndef felt_u64_h
#define felt_u64_h

#include "u128.h.metal"

// Fields that use prime 18446744069414584321
namespace p18446744069414584321
{

    // Prime field
    class Fp
    {
    public:
        Fp() = default;
        constexpr Fp(unsigned long v) : inner(v) {}

        constexpr Fp operator+(const Fp rhs) const
        {
            return Fp(add(inner, rhs.inner));
        }

        constexpr Fp operator-(const Fp rhs) const
        {
            return Fp(sub(inner, rhs.inner));
        }

        constexpr explicit operator unsigned long() const
        {
            return inner;
        }

        Fp operator*(const Fp rhs) const
        {
            return Fp(mul(inner, rhs.inner));
        }

        // used for S-box in the Rescue Prime Optimized hash function
        Fp pow7() {
            unsigned long t2 = mul(inner, inner);
            return mul(mul(mul(t2, t2), t2), inner);
        }

        // used for S-box-inverse in the Rescue Prime Optimized hash function
        Fp pow10540996611094048183() {
            // ADDCHAIN IMPLEMENTATION
            // // used addchain
            // // https://github.com/mmcloughlin/addchain
            // unsigned long _100 = sqn<2>(inner);
            // unsigned long _100000 = sqn<3>(_100);
            // unsigned long _100001 = mul(_100000, inner);
            // unsigned long i9 = sqn<3>(_100000);
            // unsigned long i10 = mul(_100001, i9);
            // unsigned long i14 = mul(sqn<3>(i9), i10);
            // unsigned long i17 = mul(sqn<1>(mul(_100, i14)), inner);
            // unsigned long i18 = mul(i14, i17);
            // unsigned long i19 = mul(i17, i18);
            // unsigned long i20 = mul(i18, i19);
            // unsigned long i21 = mul(inner, i20);
            // unsigned long i23 = mul(sqn<1>(i21), inner);
            // unsigned long i60 = mul(mul(sqn<17>(mul(sqn<17>(i23), i20)), i23), i21);
            // return mul(sqn<14>(i60), i19);

            // FROM MIDEN IMPLEMENTATION
            // compute base^10540996611094048183 using 72 multiplications per array element
            // 10540996611094048183 = b1001001001001001001001001001000110110110110110110110110110110111
            unsigned long t1 = mul(inner, inner);
            unsigned long t2 = mul(t1, t1);
            unsigned long t3 = exp_acc<3>(t2, t2);
            unsigned long t4 = exp_acc<6>(t3, t3);
            unsigned long t5 = exp_acc<12>(t4, t4);
            unsigned long t6 = exp_acc<6>(t5, t3);
            unsigned long t7 = exp_acc<31>(t6, t6);
            unsigned long t8 = sqn<2>(mul(mul(t7, t7), t6));
            return mul(mul(mul(t1, t2), inner), t8);
        }

        Fp pow(unsigned exp)
        {
            // TODO: consider removing.
            // if (exp == 1) {
            //     return *this;
            // }

            Fp res = ONE;

            while (exp > 0)
            {
                if (exp & 1)
                {
                    res = res * *this;
                }
                exp >>= 1;
                *this = *this * *this;
            }

            return res;
        }

        Fp inverse() 
        {
            unsigned long t2 = exp_acc<1>(inner, inner);
            unsigned long t3 = exp_acc<1>(t2, inner);
            unsigned long t6 = exp_acc<3>(t3, t3);
            unsigned long t12 = exp_acc<6>(t6, t6);
            unsigned long t24 = exp_acc<12>(t12, t12);
            unsigned long t30 = exp_acc<6>(t24, t6);
            unsigned long t31 = exp_acc<1>(t30, inner);
            unsigned long t63 = exp_acc<32>(t31, t31);
            unsigned long inv = exp_acc<1>(t63, inner);
            return Fp(inv);
        }

        Fp neg()
        {
            // TODO: can improve
            return Fp(sub(0, inner));
        }

        Fp double_fp()
        {
            u128 doubled = u128(0, inner) << 1;

            unsigned long lowPart = doubled.low;
            unsigned long highPart = doubled.high;

            unsigned long reduced = lowPart;
            if (highPart > 0) {
                reduced = (reduced - (N * highPart));
            }

            return Fp(reduced);
        }
        
        // 1 in Montgomery representation
        constexpr static const constant unsigned long ONE = 4294967295;

    private:
        unsigned long inner;

        // Field modulus `p = 2^64 - 2^32 + 1`
        constexpr static const constant unsigned long N = 18446744069414584321;

        // Square of auxiliary modulus R for Montgomery reduction `R2 ≡ (2^64)^2 mod p`
        constexpr static const constant unsigned long R2 = 18446744065119617025;

        template<unsigned N_ACC>
        inline unsigned long exp_acc(unsigned long element, const unsigned long tail) const {
#pragma unroll
            for (unsigned i = 0; i < N_ACC; i++) {
                element = mul(element, element);
            }
            return mul(element, tail);
        }

        template<unsigned N_ACC>
        inline unsigned long sqn(unsigned long element) const {
#pragma unroll
            for (unsigned i = 0; i < N_ACC; i++) {
                element = mul(element, element);
            }
            return element;
        }

        inline unsigned long add(const unsigned long a, const unsigned long b) const
        {
            // We compute a + b = a - (p - b).
            unsigned long tmp = N - b;
            unsigned underflow = a < tmp;
            unsigned long x1 = a - tmp;
            unsigned adj = -underflow;
            return x1 - adj;
        }

        inline unsigned long sub(const unsigned long a, const unsigned long b) const
        {
            unsigned underflow = a < b;
            unsigned long x1 = a - b;
            unsigned adj = -underflow;
            return x1 - adj;
        }

        inline unsigned long mul(const unsigned long lhs, const unsigned long rhs) const
        {
            unsigned long xl = lhs * rhs;
            unsigned long xh = metal::mulhi(lhs, rhs);
            unsigned long tmp = xl << 32;
            unsigned a_overflow = xl > (0xFFFFFFFFFFFFFFFF - tmp);
            unsigned long a = xl + tmp;
            unsigned long b = a - (a >> 32) - a_overflow;
            unsigned r_underflow = xh < b;
            unsigned long r = xh - b;
            unsigned adj = -r_underflow;
            return r - adj;
        }
    };

    // Used in Fq3 as a return type for decompose()
    struct Fq3Decomposed {
        Fp c0;
        Fp c1;
        Fp c2;
    };

    // Cubig extension field over irreducible polynomial x^3 - 2
    // adapted from arkworks
    // TODO: make into generic cubic extension class
    class Fq3
    {
    public:
        Fq3() = default;
        constexpr Fq3(Fp c) : c0(c), c1(Fp(0)), c2(Fp(0)) {}
        constexpr Fq3(Fp c0, Fp c1, Fp c2) : c0(c0), c1(c1), c2(c2) {}

        constexpr Fq3 operator+(const Fp rhs) const
        {
            return Fq3(c0 + rhs, c1, c2);
        }

        constexpr Fq3 operator+(const Fq3 rhs) const
        {
            return Fq3(c0 + rhs.c0, c1 + rhs.c1, c2 + rhs.c2);
        }

        constexpr Fq3 operator-(const Fq3 rhs) const
        {
            return Fq3(c0 - rhs.c0, c1 - rhs.c1, c2 - rhs.c2);
        }

        // TODO: With this multiplication, the result differs from cpu rpx implementation.
        Fq3 operator*(const Fq3 rhs) const 
        {
            // Devegili OhEig Scott Dahab --- Multiplication and Squaring on
            // AbstractPairing-Friendly
            // Fields.pdf; Section 4 (Karatsuba)
            Fp a = rhs.c0;
            Fp b = rhs.c1;
            Fp c = rhs.c2;

            Fp d = c0;
            Fp e = c1;
            Fp f = c2;

            Fp ad = d * a;
            Fp be = e * b;
            Fp cf = f * c;

            Fp x = (e + f) * (b + c) - be - cf;
            Fp y = (d + e) * (a + b) - ad - be;
            Fp z = (d + f) * (a + c) - ad + be - cf;

            return Fq3(
                /* =c0 */ ad + x * Fp(NONREDIDUE), 
                /* =c1 */ y + cf * Fp(NONREDIDUE),
                /* =c2 */ z
            );
        }

        // Taken from Winterfell's implementation
        Fq3 mul(const Fq3 b) const {
             Fp a0b0 = c0 * b.c0;
             Fp a1b1 = c1 * b.c1;
             Fp a2b2 = c2 * b.c2;

             Fp a0_plus_a1 = c0 + c1;
             Fp a0_plus_a2 = c0 + c2;
             Fp a1_plus_a2 = c1 + c2;

             Fp b0_plus_b1 = b.c0 + b.c1;
             Fp b0_plus_b2 = b.c0 + b.c2;
             Fp b1_plus_b2 = b.c1 + b.c2;

             Fp a0b0_a0b1_a1b0_a1b1 = a0_plus_a1 * b0_plus_b1;
             Fp a0b0_a0b2_a2b0_a2b2 = a0_plus_a2 * b0_plus_b2;
             Fp a1b1_a1b2_a2b1_a2b2 = a1_plus_a2 * b1_plus_b2;

             Fp a0b0_minus_a1b1 = a0b0 - a1b1;

             Fp a0b0_a1b2_a2b1 = a1b1_a1b2_a2b1_a2b2 + a0b0_minus_a1b1 - a2b2;
             Fp a0b1_a1b0_a1b2_a2b1_a2b2 = a0b0_a0b1_a1b0_a1b1 + a1b1_a1b2_a2b1_a2b2 - a1b1.double_fp() - a0b0;
             Fp a0b2_a1b1_a2b0_a2b2 = a0b0_a0b2_a2b0_a2b2 - a0b0_minus_a1b1;

             return Fq3(a0b0_a1b2_a2b1, a0b1_a1b0_a1b2_a2b1_a2b2, a0b2_a1b1_a2b0_a2b2);
        }

        Fq3 operator*(const Fp rhs) const
        {
            return Fq3(c0 * rhs, c1 * rhs, c2 * rhs);
        }

        // TODO: make util function
        Fq3 pow(unsigned exp)
        {
            // TODO: investige. for some reason removing this makes tests fail
            if (exp == 1) {
                return *this;
            }

            Fq3 res = Fq3(Fp(Fp::ONE), Fp(0), Fp(0));

            while (exp > 0) {
                if (exp & 1) {
                    res = res.mul(*this);
                }
                exp >>= 1;
                // TODO: this doesn't need to be done on the last itteration.
                // may already be optimized away by the compiler. Not sure.
                *this = this->mul(*this);
            }

            return res;
        }

        Fq3 neg()
        {
            // TODO: can improve
            return Fq3(Fp(0) - c0, Fp(0) - c1, Fp(0) - c2);
        }

        // Fq3 inverse() 
        // {
        //     // TODO
        // }

        // Return inner Fp values are array of 3 elements
        inline Fq3Decomposed decompose() const
        {
            return Fq3Decomposed {
                .c0 = c0,
                .c1 = c1,
                .c2 = c2,
            };
        }

    private:
        Fp c0, c1, c2;

        // Cubic non-residue used to construct the extension field in montgomery representation. 
        // That is, `NONRESIDUE` is such that the cubic polynomial
        // `f(X) = X^3 - NONRESIDUE` in Fp\[X\] is irreducible in `Fp`.
        constexpr static const constant unsigned long NONREDIDUE = /* =2 */ 8589934590;
    };
}

#endif /* felt_u64_h */