use alloc::vec::Vec;
use core::arch::aarch64::{
uint64x2_t, vaddq_u64, vandq_u64, vdupq_n_u64, vgetq_lane_u64, vsetq_lane_u64, vshrq_n_u64,
vsubq_u64,
};
use core::fmt::Debug;
use core::iter::{Product, Sum};
use core::mem::transmute;
use core::ops::{Add, AddAssign, Div, DivAssign, Mul, MulAssign, Neg, Sub, SubAssign};
use p3_field::exponentiation::exp_10540996611094048183;
use p3_field::op_assign_macros::{
impl_add_assign, impl_add_base_field, impl_div_methods, impl_mul_base_field, impl_mul_methods,
impl_packed_value, impl_rng, impl_sub_assign, impl_sub_base_field, impl_sum_prod_base_field,
ring_sum,
};
use p3_field::{
Algebra, Field, InjectiveMonomial, PackedField, PackedFieldPow2, PackedValue,
PermutationMonomial, PrimeCharacteristicRing, PrimeField64,
};
use p3_util::reconstitute_from_base;
use rand::distr::{Distribution, StandardUniform};
use rand::{Rng, RngExt};
use crate::{Goldilocks, P};
const WIDTH: usize = 2;
const EPSILON: u64 = Goldilocks::ORDER_U64.wrapping_neg();
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq)]
#[repr(transparent)]
#[must_use]
pub struct PackedGoldilocksNeon(pub [Goldilocks; WIDTH]);
impl PackedGoldilocksNeon {
#[inline]
#[must_use]
pub(crate) fn to_vector(self) -> uint64x2_t {
unsafe { transmute(self) }
}
#[inline]
pub(crate) fn from_vector(vector: uint64x2_t) -> Self {
unsafe { transmute(vector) }
}
#[inline]
const fn broadcast(value: Goldilocks) -> Self {
Self([value; WIDTH])
}
}
impl From<Goldilocks> for PackedGoldilocksNeon {
fn from(x: Goldilocks) -> Self {
Self::broadcast(x)
}
}
impl Add for PackedGoldilocksNeon {
type Output = Self;
#[inline]
fn add(self, rhs: Self) -> Self {
Self([self.0[0] + rhs.0[0], self.0[1] + rhs.0[1]])
}
}
impl Sub for PackedGoldilocksNeon {
type Output = Self;
#[inline]
fn sub(self, rhs: Self) -> Self {
Self([self.0[0] - rhs.0[0], self.0[1] - rhs.0[1]])
}
}
impl Neg for PackedGoldilocksNeon {
type Output = Self;
#[inline]
fn neg(self) -> Self {
Self([-self.0[0], -self.0[1]])
}
}
impl Mul for PackedGoldilocksNeon {
type Output = Self;
#[inline]
fn mul(self, rhs: Self) -> Self {
Self::from_vector(mul(self.to_vector(), rhs.to_vector()))
}
}
impl_add_assign!(PackedGoldilocksNeon);
impl_sub_assign!(PackedGoldilocksNeon);
impl_mul_methods!(PackedGoldilocksNeon);
ring_sum!(PackedGoldilocksNeon);
impl_rng!(PackedGoldilocksNeon);
impl PrimeCharacteristicRing for PackedGoldilocksNeon {
type PrimeSubfield = Goldilocks;
const ZERO: Self = Self::broadcast(Goldilocks::ZERO);
const ONE: Self = Self::broadcast(Goldilocks::ONE);
const TWO: Self = Self::broadcast(Goldilocks::TWO);
const NEG_ONE: Self = Self::broadcast(Goldilocks::NEG_ONE);
#[inline]
fn from_prime_subfield(f: Self::PrimeSubfield) -> Self {
f.into()
}
#[inline]
fn halve(&self) -> Self {
Self::from_vector(halve(self.to_vector()))
}
#[inline]
fn dot_product<const N: usize>(lhs: &[Self; N], rhs: &[Self; N]) -> Self {
Self::from_fn(|lane| {
let lhs_lane: [Goldilocks; N] = core::array::from_fn(|i| lhs[i].as_slice()[lane]);
let rhs_lane: [Goldilocks; N] = core::array::from_fn(|i| rhs[i].as_slice()[lane]);
Goldilocks::dot_product(&lhs_lane, &rhs_lane)
})
}
#[inline]
fn square(&self) -> Self {
Self::from_vector(square(self.to_vector()))
}
#[inline]
fn zero_vec(len: usize) -> Vec<Self> {
unsafe { reconstitute_from_base(Goldilocks::zero_vec(len * WIDTH)) }
}
}
impl InjectiveMonomial<7> for PackedGoldilocksNeon {}
impl PermutationMonomial<7> for PackedGoldilocksNeon {
fn injective_exp_root_n(&self) -> Self {
exp_10540996611094048183(*self)
}
}
impl_add_base_field!(PackedGoldilocksNeon, Goldilocks);
impl_sub_base_field!(PackedGoldilocksNeon, Goldilocks);
impl_mul_base_field!(PackedGoldilocksNeon, Goldilocks);
impl_div_methods!(PackedGoldilocksNeon, Goldilocks);
impl_sum_prod_base_field!(PackedGoldilocksNeon, Goldilocks);
impl Algebra<Goldilocks> for PackedGoldilocksNeon {
const BATCHED_LC_CHUNK: usize = 2;
#[inline]
fn mixed_dot_product<const N: usize>(a: &[Self; N], f: &[Goldilocks; N]) -> Self {
Self::from_fn(|lane| {
let a_lane: [Goldilocks; N] = core::array::from_fn(|i| a[i].as_slice()[lane]);
Goldilocks::dot_product(&a_lane, f)
})
}
}
impl_packed_value!(PackedGoldilocksNeon, Goldilocks, WIDTH);
unsafe impl PackedField for PackedGoldilocksNeon {
type Scalar = Goldilocks;
}
#[inline]
pub fn interleave_u64(v0: uint64x2_t, v1: uint64x2_t) -> (uint64x2_t, uint64x2_t) {
unsafe {
let a0 = vgetq_lane_u64::<0>(v0);
let a1 = vgetq_lane_u64::<1>(v0);
let b0 = vgetq_lane_u64::<0>(v1);
let b1 = vgetq_lane_u64::<1>(v1);
let r0 = vsetq_lane_u64::<1>(b0, vsetq_lane_u64::<0>(a0, vdupq_n_u64(0)));
let r1 = vsetq_lane_u64::<1>(b1, vsetq_lane_u64::<0>(a1, vdupq_n_u64(0)));
(r0, r1)
}
}
unsafe impl PackedFieldPow2 for PackedGoldilocksNeon {
fn interleave(&self, other: Self, block_len: usize) -> (Self, Self) {
let (v0, v1) = (self.to_vector(), other.to_vector());
let (res0, res1) = match block_len {
1 => interleave_u64(v0, v1),
2 => (v0, v1),
_ => panic!("unsupported block length"),
};
(Self::from_vector(res0), Self::from_vector(res1))
}
}
#[inline(always)]
pub(crate) fn halve(input: uint64x2_t) -> uint64x2_t {
unsafe {
let one = vdupq_n_u64(1);
let zero = vdupq_n_u64(0);
let half = vdupq_n_u64(P.div_ceil(2));
let least_bit = vandq_u64(input, one);
let t = vshrq_n_u64::<1>(input);
let neg_least_bit = vsubq_u64(zero, least_bit);
let maybe_half = vandq_u64(half, neg_least_bit);
vaddq_u64(t, maybe_half)
}
}
#[inline]
fn mul(x: uint64x2_t, y: uint64x2_t) -> uint64x2_t {
unsafe {
let x0 = vgetq_lane_u64::<0>(x);
let x1 = vgetq_lane_u64::<1>(x);
let y0 = vgetq_lane_u64::<0>(y);
let y1 = vgetq_lane_u64::<1>(y);
let (res_0, res_1) = mul_reduce_dual_asm(x0, y0, x1, y1);
transmute([res_0, res_1])
}
}
#[inline(always)]
unsafe fn mul_reduce_dual_asm(a0: u64, b0: u64, a1: u64, b1: u64) -> (u64, u64) {
use core::arch::asm;
let result0: u64;
let result1: u64;
unsafe {
asm!(
"mul {lo0}, {a0}, {b0}",
"mul {lo1}, {a1}, {b1}",
"umulh {hi0}, {a0}, {b0}",
"umulh {hi1}, {a1}, {b1}",
"lsr {hi_hi0}, {hi0}, #32",
"lsr {hi_hi1}, {hi1}, #32",
"subs {tmp0}, {lo0}, {hi_hi0}",
"csetm {adj0:w}, cc",
"subs {tmp1}, {lo1}, {hi_hi1}",
"csetm {adj1:w}, cc",
"sub {tmp0}, {tmp0}, {adj0}",
"sub {tmp1}, {tmp1}, {adj1}",
"and {hi_lo0}, {hi0}, {epsilon}",
"and {hi_lo1}, {hi1}, {epsilon}",
"lsl {t0}, {hi_lo0}, #32",
"lsl {t1}, {hi_lo1}, #32",
"sub {hi_lo_eps0}, {t0}, {hi_lo0}",
"sub {hi_lo_eps1}, {t1}, {hi_lo1}",
"adds {result0}, {tmp0}, {hi_lo_eps0}",
"csetm {adj0:w}, cs",
"adds {result1}, {tmp1}, {hi_lo_eps1}",
"csetm {adj1:w}, cs",
"add {result0}, {result0}, {adj0}",
"add {result1}, {result1}, {adj1}",
a0 = in(reg) a0,
b0 = in(reg) b0,
a1 = in(reg) a1,
b1 = in(reg) b1,
epsilon = in(reg) EPSILON,
lo0 = out(reg) _,
lo1 = out(reg) _,
hi0 = out(reg) _,
hi1 = out(reg) _,
hi_hi0 = out(reg) _,
hi_hi1 = out(reg) _,
tmp0 = out(reg) _,
tmp1 = out(reg) _,
hi_lo0 = out(reg) _,
hi_lo1 = out(reg) _,
t0 = out(reg) _,
t1 = out(reg) _,
hi_lo_eps0 = out(reg) _,
hi_lo_eps1 = out(reg) _,
adj0 = out(reg) _,
adj1 = out(reg) _,
result0 = out(reg) result0,
result1 = out(reg) result1,
options(pure, nomem, nostack),
);
}
(result0, result1)
}
#[inline]
fn square(x: uint64x2_t) -> uint64x2_t {
unsafe {
let x0 = vgetq_lane_u64::<0>(x);
let x1 = vgetq_lane_u64::<1>(x);
let (res_0, res_1) = mul_reduce_dual_asm(x0, x0, x1, x1);
transmute([res_0, res_1])
}
}
#[cfg(test)]
mod tests {
use p3_field_testing::test_packed_field;
use super::{Goldilocks, PackedGoldilocksNeon, WIDTH};
const SPECIAL_VALS: [Goldilocks; WIDTH] =
Goldilocks::new_array([0xFFFF_FFFF_0000_0000, 0xFFFF_FFFF_FFFF_FFFF]);
const ZEROS: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([
0x0000_0000_0000_0000,
0xFFFF_FFFF_0000_0001, ]));
const ONES: PackedGoldilocksNeon = PackedGoldilocksNeon(Goldilocks::new_array([
0x0000_0000_0000_0001,
0xFFFF_FFFF_0000_0002, ]));
test_packed_field!(
crate::PackedGoldilocksNeon,
&[super::ZEROS],
&[super::ONES],
crate::PackedGoldilocksNeon(super::SPECIAL_VALS)
);
}