use core::arch::asm;
use super::packing::PackedGoldilocksNeon;
use crate::{Goldilocks, P};
/// Two's-complement negation of the field modulus `P`, i.e. `2^64 - P` taken
/// modulo `2^64`.
///
/// The multiplication routines in this file use it both as a bit mask
/// (`and {t0}, {hi}, {epsilon}`) and as a multiplier when folding the high
/// product word.
///
/// NOTE(review): the mask usage presumes `P = 2^64 - 2^32 + 1`, which gives
/// `EPSILON = 2^32 - 1`; `P` is defined outside this file — confirm.
const EPSILON: u64 = 0u64.wrapping_sub(P);
/// Multiplies two Goldilocks field elements using hand-written AArch64 assembly.
///
/// The full 128-bit product `a * b` is formed as `hi:lo` and folded back into
/// 64 bits as `lo - (hi >> 32) + (hi & EPSILON) * EPSILON`, with a fix-up of
/// `0xFFFF_FFFF` applied after each 64-bit borrow or carry.
///
/// NOTE(review): the folding identity relies on `P = 2^64 - 2^32 + 1`, so that
/// `EPSILON = 2^32 - 1`, `2^64` is congruent to `EPSILON` and `2^96` is
/// congruent to `-1` modulo `P`. `P` is defined outside this file — confirm.
///
/// The asm performs no final conditional subtraction of `P`, so the returned
/// value may be a non-canonical representative (it can be `>= P`).
///
/// # Safety
///
/// The asm only reads its input registers and writes its own output/scratch
/// registers and the condition flags. No additional caller obligation is
/// visible in this file — TODO confirm whether the crate expects anything
/// further (e.g. target gating) from callers.
#[inline(always)]
pub(super) unsafe fn mul_asm(a: u64, b: u64) -> u64 {
// Scratch registers; their final values are never read on the Rust side.
let _lo: u64;
let _hi: u64;
let _t0: u64;
let _t1: u64;
let _t2: u64;
let result: u64;
// SAFETY: every instruction in the template is register-to-register and touches only
// the operands declared below plus the condition flags (which are not marked as
// preserved), matching the `nomem, nostack` options.
unsafe {
asm!(
// lo = low 64 bits of a * b.
"mul {lo}, {a}, {b}",
// hi = high 64 bits of a * b.
"umulh {hi}, {a}, {b}",
// t0 = hi >> 32; t1 = lo - t0. `csetm {t2:w}, cc` writes 0xFFFF_FFFF to the 32-bit view
// (upper half zeroed) when the subtraction borrowed and 0 otherwise, so the trailing
// `sub` removes 0xFFFF_FFFF from t1 exactly when a borrow occurred.
"lsr {t0}, {hi}, #32", "subs {t1}, {lo}, {t0}", "csetm {t2:w}, cc", "sub {t1}, {t1}, {t2}",
// t0 = (hi & EPSILON) * EPSILON: EPSILON doubles as the mask and the multiplier.
"and {t0}, {hi}, {epsilon}", "mul {t0}, {t0}, {epsilon}",
// result = t1 + t0; on carry-out, add 0xFFFF_FFFF back in.
"adds {result}, {t1}, {t0}", "csetm {t2:w}, cs", "add {result}, {result}, {t2}",
a = in(reg) a,
b = in(reg) b,
epsilon = in(reg) EPSILON,
// `out` (not `lateout`) keeps each temporary in a register distinct from the inputs,
// which is required because `a`, `b` and `epsilon` are read after the first write.
lo = out(reg) _lo,
hi = out(reg) _hi,
t0 = out(reg) _t0,
t1 = out(reg) _t1,
t2 = out(reg) _t2,
result = out(reg) result,
// No `preserves_flags`: `subs`/`adds` overwrite the condition flags.
options(pure, nomem, nostack),
);
}
result
}
/// Computes `a * b + c` over the Goldilocks field using hand-written AArch64
/// assembly.
///
/// The 128-bit product `a * b` is formed as `hi:lo`, `c` is added into it with
/// carry propagation, and the 128-bit sum is folded back into 64 bits as
/// `lo - (hi >> 32) + (hi & EPSILON) * EPSILON`, with a fix-up of `0xFFFF_FFFF`
/// applied after each 64-bit borrow or carry.
///
/// NOTE(review): the folding identity relies on `P = 2^64 - 2^32 + 1`, so that
/// `EPSILON = 2^32 - 1`, `2^64` is congruent to `EPSILON` and `2^96` is
/// congruent to `-1` modulo `P`. `P` is defined outside this file — confirm.
///
/// The asm performs no final conditional subtraction of `P`, so the returned
/// value may be a non-canonical representative (it can be `>= P`).
///
/// # Safety
///
/// The asm only reads its input registers and writes its own output/scratch
/// registers and the condition flags. No additional caller obligation is
/// visible in this file — TODO confirm whether the crate expects anything
/// further (e.g. target gating) from callers.
#[inline(always)]
pub(super) unsafe fn mul_add_asm(a: u64, b: u64, c: u64) -> u64 {
// Scratch registers; their final values are never read on the Rust side.
let _lo: u64;
let _hi: u64;
let _t0: u64;
let _t1: u64;
let _t2: u64;
let result: u64;
// SAFETY: every instruction in the template is register-to-register and touches only
// the operands declared below plus the condition flags (which are not marked as
// preserved), matching the `nomem, nostack` options.
unsafe {
asm!(
// lo = low 64 bits of a * b.
"mul {lo}, {a}, {b}",
// hi = high 64 bits of a * b.
"umulh {hi}, {a}, {b}",
// 128-bit accumulate: lo += c, then propagate the carry into hi. The sum cannot
// exceed 128 bits because (2^64 - 1)^2 + (2^64 - 1) < 2^128.
"adds {lo}, {lo}, {c}",
"adc {hi}, {hi}, xzr",
// t0 = hi >> 32; t1 = lo - t0. `csetm {t2:w}, cc` writes 0xFFFF_FFFF to the 32-bit view
// (upper half zeroed) when the subtraction borrowed and 0 otherwise, so the trailing
// `sub` removes 0xFFFF_FFFF from t1 exactly when a borrow occurred.
"lsr {t0}, {hi}, #32", "subs {t1}, {lo}, {t0}", "csetm {t2:w}, cc", "sub {t1}, {t1}, {t2}",
// t0 = (hi & EPSILON) * EPSILON: EPSILON doubles as the mask and the multiplier.
"and {t0}, {hi}, {epsilon}", "mul {t0}, {t0}, {epsilon}",
// result = t1 + t0; on carry-out, add 0xFFFF_FFFF back in.
"adds {result}, {t1}, {t0}", "csetm {t2:w}, cs", "add {result}, {result}, {t2}",
a = in(reg) a,
b = in(reg) b,
c = in(reg) c,
epsilon = in(reg) EPSILON,
// `out` (not `lateout`) keeps each temporary in a register distinct from the inputs,
// which is required because the inputs are still read after the first write.
lo = out(reg) _lo,
hi = out(reg) _hi,
t0 = out(reg) _t0,
t1 = out(reg) _t1,
t2 = out(reg) _t2,
result = out(reg) result,
// No `preserves_flags`: `adds`/`subs` overwrite the condition flags.
options(pure, nomem, nostack),
);
}
result
}
/// Adds two Goldilocks field elements using hand-written AArch64 assembly.
///
/// Steps, as encoded in the asm template:
/// 1. `b` is conditionally reduced once (`b - P` when `b >= P`).
/// 2. `a` is added to the reduced `b`; on a 64-bit carry-out, `0xFFFF_FFFF` is
///    added back to the wrapped sum.
/// 3. The sum is conditionally reduced once more (`result - P` when
///    `result >= P`).
///
/// NOTE(review): the correction in step 2 equals `2^64 mod P` only for
/// `P = 2^64 - 2^32 + 1`; `P` is defined outside this file — confirm. Under
/// that assumption, reducing `b` first keeps the step-2 correction from
/// overflowing a second time, and step 3 leaves a value below `P`.
///
/// # Safety
///
/// The asm only reads its input registers and writes its own output/scratch
/// registers and the condition flags. No additional caller obligation is
/// visible in this file — TODO confirm whether the crate expects anything
/// further (e.g. target gating) from callers.
#[inline(always)]
pub(super) unsafe fn add_asm(a: u64, b: u64) -> u64 {
let result: u64;
// Scratch registers; their final values are never read on the Rust side.
let _t0: u64;
let _t1: u64;
let _adj: u64;
// SAFETY: every instruction in the template is register-to-register and touches only
// the operands declared below plus the condition flags (which are not marked as
// preserved), matching the `nomem, nostack` options.
unsafe {
asm!(
// t0 = b - P; carry set means no borrow, i.e. b >= P, in which case t0 is selected.
"subs {t0}, {b}, {p}",
"csel {b_canon}, {t0}, {b}, cs",
// result = a + b_canon, setting the carry flag on 64-bit overflow.
"adds {result}, {a}, {b_canon}",
// adj = 0xFFFF_FFFF on carry (32-bit write, upper half zeroed), otherwise 0.
"csetm {adj:w}, cs",
"add {result}, {result}, {adj}",
// Final conditional subtraction: keep result - P when result >= P.
"subs {t1}, {result}, {p}",
"csel {result}, {t1}, {result}, cs",
a = in(reg) a,
b = in(reg) b,
// Discarded output: only needed as a temporary inside the template.
b_canon = out(reg) _,
p = in(reg) P,
result = out(reg) result,
t0 = out(reg) _t0,
t1 = out(reg) _t1,
adj = out(reg) _adj,
// No `preserves_flags`: `subs`/`adds` overwrite the condition flags.
options(pure, nomem, nostack),
);
}
result
}
/// Splits an array of packed elements into two per-lane arrays of raw `u64`s.
///
/// For every index `i`, the first returned array holds `state[i].0[0].value`
/// and the second holds `state[i].0[1].value`. The stored values are copied
/// out as-is; no reduction is applied here.
#[inline]
pub(super) fn unpack_lanes<const WIDTH: usize>(
    state: &[PackedGoldilocksNeon; WIDTH],
) -> ([u64; WIDTH], [u64; WIDTH]) {
    let mut first_lane = [0u64; WIDTH];
    let mut second_lane = [0u64; WIDTH];

    // Walk the packed state once, writing both lanes of each element.
    let outputs = first_lane.iter_mut().zip(second_lane.iter_mut());
    for (packed, (out0, out1)) in state.iter().zip(outputs) {
        *out0 = packed.0[0].value;
        *out1 = packed.0[1].value;
    }

    (first_lane, second_lane)
}
/// Rebuilds the packed state from two per-lane arrays of raw `u64`s.
///
/// Every `state[i]` is overwritten with a packed element whose lanes are
/// `Goldilocks::new(lane0[i])` and `Goldilocks::new(lane1[i])`, in that order.
#[inline]
pub(super) fn pack_lanes<const WIDTH: usize>(
    state: &mut [PackedGoldilocksNeon; WIDTH],
    lane0: &[u64; WIDTH],
    lane1: &[u64; WIDTH],
) {
    // All three arrays share the same const length, so the zips cover every slot.
    let lane_pairs = lane0.iter().zip(lane1.iter());
    for (slot, (&first, &second)) in state.iter_mut().zip(lane_pairs) {
        *slot = PackedGoldilocksNeon([Goldilocks::new(first), Goldilocks::new(second)]);
    }
}
#[cfg(test)]
pub(super) mod tests {
    use alloc::vec::Vec;

    use p3_field::{PrimeCharacteristicRing, PrimeField64};
    use proptest::prelude::*;

    use super::*;

    type F = Goldilocks;

    /// Outcome of the shared property-check helpers; lets them use `prop_assert_eq!`.
    type PropResult = Result<(), proptest::test_runner::TestCaseError>;

    /// Maps a raw `u64` to its canonical representative via the field type.
    fn canon(x: u64) -> u64 {
        let element = F::new(x);
        element.as_canonical_u64()
    }

    /// Reference `a + b`, computed with the field type's operators.
    fn reference_add(a: u64, b: u64) -> u64 {
        (F::new(a) + F::new(b)).as_canonical_u64()
    }

    /// Reference `a * b`, computed with the field type's operators.
    fn reference_mul(a: u64, b: u64) -> u64 {
        (F::new(a) * F::new(b)).as_canonical_u64()
    }

    /// Reference `a * b + c`, computed with the field type's operators.
    fn reference_mul_add(a: u64, b: u64, c: u64) -> u64 {
        (F::new(a) * F::new(b) + F::new(c)).as_canonical_u64()
    }

    /// Canonicalized output of `add_asm`.
    fn asm_add_canonical(a: u64, b: u64) -> u64 {
        // SAFETY: `add_asm` is exercised here with arbitrary `u64` operands, as in every
        // test of this module.
        canon(unsafe { add_asm(a, b) })
    }

    /// Canonicalized output of `mul_asm`.
    fn asm_mul_canonical(a: u64, b: u64) -> u64 {
        // SAFETY: see `asm_add_canonical`.
        canon(unsafe { mul_asm(a, b) })
    }

    /// Canonicalized output of `mul_add_asm`.
    fn asm_mul_add_canonical(a: u64, b: u64, c: u64) -> u64 {
        // SAFETY: see `asm_add_canonical`.
        canon(unsafe { mul_add_asm(a, b, c) })
    }

    /// Hand-picked operands around zero, `EPSILON`, powers of two, `P` and `u64::MAX`.
    pub const EDGE_VALUES: &[u64] = &[
        // Small values.
        0,
        1,
        2,
        // Neighbourhood of EPSILON.
        EPSILON - 1,
        EPSILON,
        EPSILON + 1,
        // Assorted power-of-two patterns.
        1u64 << 31,
        (1u64 << 32) + 1,
        1u64 << 33,
        1u64 << 63,
        // Neighbourhood of the modulus.
        P - 2,
        P - 1,
        P,
        P + 1,
        P + 2,
        // The two operands used by `test_add_asm_large_values`.
        18_446_744_069_605_983_184,
        18_446_744_073_709_551_599,
        // Top of the `u64` range.
        u64::MAX - 1,
        u64::MAX,
    ];

    /// Strategy mixing the curated edge values, values at or above `P`, and uniform `u64`s.
    pub fn danger_u64() -> impl Strategy<Value = u64> {
        let curated = prop::sample::select(EDGE_VALUES.to_vec());
        let from_modulus_exclusive_max = P..u64::MAX;
        let from_modulus_epsilon_span = P..=P.saturating_add(EPSILON - 1);
        let uniform = any::<u64>();
        prop_oneof![
            curated,
            from_modulus_exclusive_max,
            from_modulus_epsilon_span,
            uniform,
        ]
    }

    /// Strategy producing a `[u64; WIDTH]` whose entries are all drawn from [`danger_u64`].
    pub fn danger_array<const WIDTH: usize>() -> impl Strategy<Value = [u64; WIDTH]> {
        let exact_len_vec = prop::collection::vec(danger_u64(), WIDTH);
        exact_len_vec.prop_map(|values: Vec<u64>| {
            <[u64; WIDTH]>::try_from(values)
                .expect("prop::collection::vec produces exactly WIDTH elements")
        })
    }

    /// Every ordered pair of edge values, outer operand varying slowest.
    fn edge_pairs() -> impl Iterator<Item = (u64, u64)> {
        EDGE_VALUES
            .iter()
            .flat_map(|&a| EDGE_VALUES.iter().map(move |&b| (a, b)))
    }

    /// Builds a packed array whose lanes are `F::new(lane_a[i])` and `F::new(lane_b[i])`.
    fn packed_from_lanes<const WIDTH: usize>(
        lane_a: &[u64; WIDTH],
        lane_b: &[u64; WIDTH],
    ) -> [PackedGoldilocksNeon; WIDTH] {
        core::array::from_fn(|i| PackedGoldilocksNeon([F::new(lane_a[i]), F::new(lane_b[i])]))
    }

    /// A packed array with every lane set to `F::ZERO`.
    fn zeroed_state<const WIDTH: usize>() -> [PackedGoldilocksNeon; WIDTH] {
        [PackedGoldilocksNeon([F::ZERO; 2]); WIDTH]
    }

    /// `unpack_lanes` must return the stored `value` of each lane.
    fn check_unpack_lanes<const WIDTH: usize>(
        lane_a: [u64; WIDTH],
        lane_b: [u64; WIDTH],
    ) -> PropResult {
        let packed = packed_from_lanes(&lane_a, &lane_b);
        let (got0, got1) = unpack_lanes(&packed);
        for i in 0..WIDTH {
            prop_assert_eq!(got0[i], F::new(lane_a[i]).value);
            prop_assert_eq!(got1[i], F::new(lane_b[i]).value);
        }
        Ok(())
    }

    /// `pack_lanes` must store `F::new` of each raw lane value.
    fn check_pack_lanes<const WIDTH: usize>(
        lane_a: [u64; WIDTH],
        lane_b: [u64; WIDTH],
    ) -> PropResult {
        let mut packed = zeroed_state::<WIDTH>();
        pack_lanes(&mut packed, &lane_a, &lane_b);
        for i in 0..WIDTH {
            prop_assert_eq!(packed[i].0[0], F::new(lane_a[i]));
            prop_assert_eq!(packed[i].0[1], F::new(lane_b[i]));
        }
        Ok(())
    }

    /// Packing then unpacking must preserve every lane up to canonical form.
    fn check_roundtrip_pack_unpack<const WIDTH: usize>(
        lane_a: [u64; WIDTH],
        lane_b: [u64; WIDTH],
    ) -> PropResult {
        let mut packed = zeroed_state::<WIDTH>();
        pack_lanes(&mut packed, &lane_a, &lane_b);
        let (out0, out1) = unpack_lanes(&packed);
        for i in 0..WIDTH {
            prop_assert_eq!(F::new(out0[i]).as_canonical_u64(), F::new(lane_a[i]).as_canonical_u64());
            prop_assert_eq!(F::new(out1[i]).as_canonical_u64(), F::new(lane_b[i]).as_canonical_u64());
        }
        Ok(())
    }

    /// Unpacking then packing must reproduce the original packed elements.
    fn check_roundtrip_unpack_pack<const WIDTH: usize>(
        lane_a: [u64; WIDTH],
        lane_b: [u64; WIDTH],
    ) -> PropResult {
        let original = packed_from_lanes(&lane_a, &lane_b);
        let (raw0, raw1) = unpack_lanes(&original);
        let mut restored = zeroed_state::<WIDTH>();
        pack_lanes(&mut restored, &raw0, &raw1);
        for i in 0..WIDTH {
            prop_assert_eq!(restored[i].0[0], original[i].0[0]);
            prop_assert_eq!(restored[i].0[1], original[i].0[1]);
        }
        Ok(())
    }

    /// Fixed pair of large operands checked against a precomputed canonical sum.
    #[test]
    fn test_add_asm_large_values() {
        let lhs: u64 = 18_446_744_069_605_983_184;
        let rhs: u64 = 18_446_744_073_709_551_599;
        let want = 4_486_366_141u64;
        assert_eq!(asm_add_canonical(lhs, rhs), want);
    }

    /// `add_asm` agrees with the field type on every ordered pair of edge values.
    #[test]
    fn test_add_asm_edge_pairs() {
        for (a, b) in edge_pairs() {
            assert_eq!(asm_add_canonical(a, b), reference_add(a, b), "add({a}, {b})");
        }
    }

    /// `mul_asm` agrees with the field type on every ordered pair of edge values.
    #[test]
    fn test_mul_asm_edge_pairs() {
        for (a, b) in edge_pairs() {
            assert_eq!(asm_mul_canonical(a, b), reference_mul(a, b), "mul({a}, {b})");
        }
    }

    /// `mul_add_asm` agrees with the field type on every ordered triple of edge values.
    #[test]
    fn test_mul_add_asm_edge_triples() {
        for (a, b) in edge_pairs() {
            for &c in EDGE_VALUES {
                assert_eq!(
                    asm_mul_add_canonical(a, b, c),
                    reference_mul_add(a, b, c),
                    "mul_add({a}, {b}, {c})"
                );
            }
        }
    }

    /// Feeds `add_asm` its own output 1000 times and compares with the field type.
    #[test]
    fn test_add_asm_chained_accumulation() {
        let addend = P - 1;
        let mut raw_sum: u64 = addend;
        let mut field_sum = F::new(addend);
        for _ in 0..1000 {
            // SAFETY: see `asm_add_canonical`.
            raw_sum = unsafe { add_asm(raw_sum, addend) };
            field_sum += F::new(addend);
        }
        assert_eq!(canon(raw_sum), field_sum.as_canonical_u64());
    }

    proptest! {
        // Uniformly random operands.
        #[test]
        fn test_add_asm(a: u64, b: u64) {
            prop_assert_eq!(asm_add_canonical(a, b), reference_add(a, b));
        }

        #[test]
        fn test_mul_asm(a: u64, b: u64) {
            prop_assert_eq!(asm_mul_canonical(a, b), reference_mul(a, b));
        }

        #[test]
        fn test_mul_add_asm(a: u64, b: u64, c: u64) {
            prop_assert_eq!(asm_mul_add_canonical(a, b, c), reference_mul_add(a, b, c));
        }

        // Operands drawn from the edge-biased strategy.
        #[test]
        fn test_add_asm_danger(a in danger_u64(), b in danger_u64()) {
            prop_assert_eq!(asm_add_canonical(a, b), reference_add(a, b));
        }

        #[test]
        fn test_mul_asm_danger(a in danger_u64(), b in danger_u64()) {
            prop_assert_eq!(asm_mul_canonical(a, b), reference_mul(a, b));
        }

        #[test]
        fn test_mul_add_asm_danger(
            a in danger_u64(),
            b in danger_u64(),
            c in danger_u64(),
        ) {
            prop_assert_eq!(asm_mul_add_canonical(a, b, c), reference_mul_add(a, b, c));
        }

        // Lane packing / unpacking at widths 8 and 12.
        #[test]
        fn test_unpack_lanes_w8(
            lane_a in prop::array::uniform8(any::<u64>()),
            lane_b in prop::array::uniform8(any::<u64>()),
        ) {
            check_unpack_lanes(lane_a, lane_b)?;
        }

        #[test]
        fn test_unpack_lanes_w12(
            lane_a in prop::array::uniform12(any::<u64>()),
            lane_b in prop::array::uniform12(any::<u64>()),
        ) {
            check_unpack_lanes(lane_a, lane_b)?;
        }

        #[test]
        fn test_pack_lanes_w8(
            lane_a in prop::array::uniform8(any::<u64>()),
            lane_b in prop::array::uniform8(any::<u64>()),
        ) {
            check_pack_lanes(lane_a, lane_b)?;
        }

        #[test]
        fn test_pack_lanes_w12(
            lane_a in prop::array::uniform12(any::<u64>()),
            lane_b in prop::array::uniform12(any::<u64>()),
        ) {
            check_pack_lanes(lane_a, lane_b)?;
        }

        #[test]
        fn test_roundtrip_pack_unpack_w8(
            lane_a in prop::array::uniform8(any::<u64>()),
            lane_b in prop::array::uniform8(any::<u64>()),
        ) {
            check_roundtrip_pack_unpack(lane_a, lane_b)?;
        }

        #[test]
        fn test_roundtrip_pack_unpack_w12(
            lane_a in prop::array::uniform12(any::<u64>()),
            lane_b in prop::array::uniform12(any::<u64>()),
        ) {
            check_roundtrip_pack_unpack(lane_a, lane_b)?;
        }

        #[test]
        fn test_roundtrip_unpack_pack_w8(
            lane_a in prop::array::uniform8(any::<u64>()),
            lane_b in prop::array::uniform8(any::<u64>()),
        ) {
            check_roundtrip_unpack_pack(lane_a, lane_b)?;
        }

        #[test]
        fn test_roundtrip_unpack_pack_w12(
            lane_a in prop::array::uniform12(any::<u64>()),
            lane_b in prop::array::uniform12(any::<u64>()),
        ) {
            check_roundtrip_unpack_pack(lane_a, lane_b)?;
        }
    }
}