use core::simd::i64x2;
use super::{State, compute_block_scalar_reduction};
/// Processes one 16-byte `block` into `state` by forwarding all arguments,
/// unchanged, to the `v`-target-feature implementation `compute_block_impl`.
///
/// `partial` is passed straight through; its meaning is defined by
/// `compute_block_scalar_reduction` (presumably it marks a short final
/// block — TODO confirm against that helper).
#[inline]
pub(super) fn compute_block(state: &mut State, block: &[u8; 16], partial: bool) {
// NOTE(review): SAFETY — `compute_block_impl` is compiled with the `v` target
// feature enabled, so this call is only sound when the executing CPU supports
// it. Nothing visible in this file checks that; it is presumably guaranteed by
// how the parent module selects this backend (cfg gate or runtime detection).
// Confirm and replace this note with a proper `// SAFETY:` justification.
unsafe { compute_block_impl(state, block, partial) }
}
/// Runs the shared `compute_block_scalar_reduction` routine on `block`,
/// supplying [`sum4_mul`] as its four-element multiply-and-sum step.
///
/// `state`, `block` and `partial` are forwarded to that routine untouched.
///
/// # Safety
///
/// The executing CPU must support the `v` target feature.
#[target_feature(enable = "v")]
unsafe fn compute_block_impl(state: &mut State, block: &[u8; 16], partial: bool) {
    let mul_sum = |lhs: [u32; 4], rhs: [u32; 4]| -> u64 {
        // SAFETY: `sum4_mul` only requires the `v` target feature, which the
        // caller of this function has already promised is available.
        unsafe { sum4_mul(lhs, rhs) }
    };

    compute_block_scalar_reduction(state, block, partial, mul_sum);
}
/// Multiplies `lhs` and `rhs` element-wise and returns the sum of the four
/// products, reduced modulo 2^64.
///
/// Every `u32` is widened to an `i64` lane before multiplying. A single
/// product may exceed `i64::MAX`, but `core::simd` integer operators wrap on
/// overflow, so each lane ends up with exactly the bit pattern that wrapping
/// `u64` arithmetic would have produced.
///
/// # Safety
///
/// The executing CPU must support the `v` target feature.
#[target_feature(enable = "v")]
unsafe fn sum4_mul(lhs: [u32; 4], rhs: [u32; 4]) -> u64 {
    let [l0, l1, l2, l3] = lhs.map(i64::from);
    let [r0, r1, r2, r3] = rhs.map(i64::from);

    // Lane 0 accumulates elements 0 and 2; lane 1 accumulates elements 1 and 3.
    let even_odd = i64x2::from_array([l0, l1]) * i64x2::from_array([r0, r1])
        + i64x2::from_array([l2, l3]) * i64x2::from_array([r2, r3]);

    // Fold the two lanes together; the casts only reinterpret the bits as unsigned.
    let [even, odd] = even_odd.to_array();
    (even as u64).wrapping_add(odd as u64)
}