use crate::backend::scalar::{
BLOCK_STRIPES, FINAL_STRIPE_SECRET_OFFSET, SCRAMBLE_PRIME, SCRAMBLE_SECRET_OFFSET, STRIPE_BYTES,
STRIPE_SECRET_BASE, init_acc, merge_acc,
};
use crate::constants::SECRET_STREAM;
use crate::memory::{r_u64x2, r_u64x2_aligned};
use core::arch::aarch64::*;
/// Folds one 128-bit chunk of input into a pair of 64-bit accumulator lanes.
///
/// For each 64-bit lane the input is XORed with the secret, the low and high
/// 32-bit halves of that keyed value are multiplied into a 64-bit product, and
/// the product is added (wrapping) to the accumulator together with the input
/// taken from the *other* lane.
///
/// # Safety
///
/// The executing CPU must support NEON.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn mix_pair(
    acc: uint64x2_t,
    data: uint64x2_t,
    secret: uint64x2_t,
) -> uint64x2_t {
    // Keyed input, then its two 32-bit halves widened-multiplied per lane.
    let keyed = veorq_u64(data, secret);
    let product = vmull_u32(vmovn_u64(keyed), vshrn_n_u64::<32>(keyed));

    // Exchange the two lanes so each accumulator lane absorbs its neighbour's raw input.
    let crossed = vextq_u64::<1>(data, data);

    let with_input = vaddq_u64(acc, crossed);
    vaddq_u64(with_input, product)
}
/// Multiplies each 64-bit lane of `v` by the 32-bit constant `c`, keeping the
/// low 64 bits of every product (wrapping multiplication).
///
/// The 64x32 multiply is assembled from two widening 32x32 multiplies: the
/// low half contributes its full product, the high half contributes its
/// product shifted left by 32 bits.
///
/// # Safety
///
/// The executing CPU must support NEON.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn mul_u64x2_by_u32(v: uint64x2_t, c: u32) -> uint64x2_t {
    let factor = vdup_n_u32(c);

    // low32(v) * c — full 64-bit result per lane.
    let low_part = vmull_u32(vmovn_u64(v), factor);
    // high32(v) * c — only its low 32 bits survive the shift below.
    let high_part = vmull_u32(vshrn_n_u64::<32>(v), factor);

    vaddq_u64(low_part, vshlq_n_u64::<32>(high_part))
}
/// Scrambles a pair of accumulator lanes.
///
/// Each lane is XORed with itself shifted right by 47 bits, XORed with the
/// matching secret lane, and finally multiplied (wrapping) by the low 32 bits
/// of `SCRAMBLE_PRIME`.
///
/// # Safety
///
/// The executing CPU must support NEON.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn scramble_pair(v: uint64x2_t, s: uint64x2_t) -> uint64x2_t {
    // Fold the top bits of each lane back into its low bits.
    let folded = veorq_u64(v, vshrq_n_u64::<47>(v));
    let keyed = veorq_u64(folded, s);
    // The cast keeps only the low 32 bits of the constant.
    let prime = SCRAMBLE_PRIME as u32;
    unsafe { mul_u64x2_by_u32(keyed, prime) }
}
/// Applies [`scramble_pair`] to all four accumulator registers in place.
///
/// Accumulator `aN` is paired with the secret words starting at
/// `secret_ptr + 2 * N` (in `u64` units).
///
/// # Safety
///
/// * The executing CPU must support NEON.
/// * `secret_ptr` must be valid for the reads `r_u64x2_aligned` performs at
///   element offsets 0, 2, 4 and 6 (presumably two `u64`s each, i.e. eight
///   `u64`s in total — confirm against `r_u64x2_aligned`), and must satisfy
///   whatever alignment that helper requires.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn scramble_neon(
    a0: &mut uint64x2_t,
    a1: &mut uint64x2_t,
    a2: &mut uint64x2_t,
    a3: &mut uint64x2_t,
    secret_ptr: *const u64,
) {
    for (index, lanes) in [a0, a1, a2, a3].into_iter().enumerate() {
        // Each register covers two consecutive u64 secret words.
        let key = unsafe { r_u64x2_aligned(secret_ptr.add(2 * index)) };
        *lanes = unsafe { scramble_pair(*lanes, key) };
    }
}
/// Mixes one input stripe into the four accumulator registers in place.
///
/// Accumulator `aN` absorbs the input loaded from `data_ptr + 16 * N` (bytes)
/// keyed with the secret loaded from `secret_ptr + 2 * N` (`u64` units), via
/// [`mix_pair`].
///
/// # Safety
///
/// * The executing CPU must support NEON.
/// * `data_ptr` must be valid for the reads `r_u64x2` performs at byte offsets
///   0, 16, 32 and 48 (presumably 16 bytes each, 64 bytes in total — confirm
///   against `r_u64x2`).
/// * `secret_ptr` must be valid for the reads `r_u64x2_aligned` performs at
///   element offsets 0, 2, 4 and 6, and must satisfy whatever alignment that
///   helper requires.
#[target_feature(enable = "neon")]
#[inline]
unsafe fn mix_stripe_neon(
    a0: &mut uint64x2_t,
    a1: &mut uint64x2_t,
    a2: &mut uint64x2_t,
    a3: &mut uint64x2_t,
    data_ptr: *const u8,
    secret_ptr: *const u64,
) {
    // Lane 0: input bytes 0..16, secret words 0..2.
    let (input, key) = unsafe { (r_u64x2(data_ptr), r_u64x2_aligned(secret_ptr)) };
    *a0 = unsafe { mix_pair(*a0, input, key) };

    // Lane 1: input bytes 16..32, secret words 2..4.
    let (input, key) = unsafe { (r_u64x2(data_ptr.add(16)), r_u64x2_aligned(secret_ptr.add(2))) };
    *a1 = unsafe { mix_pair(*a1, input, key) };

    // Lane 2: input bytes 32..48, secret words 4..6.
    let (input, key) = unsafe { (r_u64x2(data_ptr.add(32)), r_u64x2_aligned(secret_ptr.add(4))) };
    *a2 = unsafe { mix_pair(*a2, input, key) };

    // Lane 3: input bytes 48..64, secret words 6..8.
    let (input, key) = unsafe { (r_u64x2(data_ptr.add(48)), r_u64x2_aligned(secret_ptr.add(6))) };
    *a3 = unsafe { mix_pair(*a3, input, key) };
}
/// NEON implementation of the long-input hash.
///
/// The accumulators produced by `init_acc(acc_in)` are loaded into four
/// 128-bit registers. Every full `STRIPE_BYTES`-sized stripe of the input is
/// mixed in, with the secret position advancing by one `u64` per stripe
/// starting from `STRIPE_SECRET_BASE`. After every `BLOCK_STRIPES` stripes the
/// accumulators are scrambled and the secret position is rewound. Finally the
/// last `STRIPE_BYTES` bytes of the input are mixed once more with the secret
/// at `FINAL_STRIPE_SECRET_OFFSET`, and the accumulators are handed to
/// `merge_acc` together with `len` and `acc_in`.
///
/// # Safety
///
/// * The executing CPU must support NEON.
/// * `ptr` must be valid for reads of `len` bytes.
/// * `len` must be at least `STRIPE_BYTES`: the tail stripe is read from
///   `ptr + (len - STRIPE_BYTES)`, which underflows for shorter inputs.
/// * `SECRET_STREAM` must be long enough for every secret offset used here
///   (not checked in this function).
#[target_feature(enable = "neon")]
pub(crate) unsafe fn hash_bytes_long(ptr: *const u8, len: usize, acc_in: u64) -> u64 {
    let seeded = init_acc(acc_in);
    let seeded_ptr = seeded.as_ptr();
    let (mut a0, mut a1, mut a2, mut a3) = unsafe {
        (
            vld1q_u64(seeded_ptr),
            vld1q_u64(seeded_ptr.add(2)),
            vld1q_u64(seeded_ptr.add(4)),
            vld1q_u64(seeded_ptr.add(6)),
        )
    };

    let secret_base = SECRET_STREAM.as_ptr();
    let scramble_secret = unsafe { secret_base.add(SCRAMBLE_SECRET_OFFSET) };

    let mut offset = 0usize;
    // Position inside the current block; the stripe's secret offset is always
    // `STRIPE_SECRET_BASE + stripe_in_block`, so no separate counter is kept.
    let mut stripe_in_block = 0usize;
    while offset + STRIPE_BYTES <= len {
        unsafe {
            mix_stripe_neon(
                &mut a0,
                &mut a1,
                &mut a2,
                &mut a3,
                ptr.add(offset),
                secret_base.add(STRIPE_SECRET_BASE + stripe_in_block),
            );
        }
        offset += STRIPE_BYTES;
        stripe_in_block += 1;

        if stripe_in_block == BLOCK_STRIPES {
            // Block boundary: scramble and restart the per-block secret walk.
            unsafe {
                scramble_neon(&mut a0, &mut a1, &mut a2, &mut a3, scramble_secret);
            }
            stripe_in_block = 0;
        }
    }

    // Tail: the final STRIPE_BYTES of the input, keyed with a dedicated secret offset.
    unsafe {
        mix_stripe_neon(
            &mut a0,
            &mut a1,
            &mut a2,
            &mut a3,
            ptr.add(len - STRIPE_BYTES),
            secret_base.add(FINAL_STRIPE_SECRET_OFFSET),
        );
    }

    // Spill the registers back into a plain array for the merge step.
    let mut acc = [0u64; 8];
    let acc_ptr = acc.as_mut_ptr();
    for (pair, lanes) in [a0, a1, a2, a3].into_iter().enumerate() {
        unsafe { vst1q_u64(acc_ptr.add(2 * pair), lanes) };
    }
    merge_acc(&acc, len, acc_in)
}