use core::arch::x86_64::*;
use super::{scalar, Vector};
/// Zero-sized handle selecting the AVX2 code paths of [`Vector`].
///
/// The single field is a private `()`, so code outside this module cannot
/// build the value with a struct literal and has to go through a constructor
/// defined here.
#[derive(Copy, Clone)]
pub struct Impl(());
impl Impl {
    /// Creates the AVX2 handle without performing any CPU feature detection.
    ///
    /// # Safety
    ///
    /// The [`Vector`] methods of the returned value call functions compiled
    /// with `#[target_feature(enable = "avx2")]`, so the caller must make
    /// sure the running CPU supports AVX2 before using the value.
    #[inline]
    #[cfg(feature = "std")]
    pub unsafe fn new_unchecked() -> Impl {
        Self(())
    }
}
impl Vector for Impl {
    /// Forwards to [`round_scramble_avx2`].
    #[inline]
    fn round_scramble(&self, acc: &mut [u64; 8], secret_end: &[u8; 64]) {
        // SAFETY: the callee requires AVX2. The only constructor visible in
        // this file is the `unsafe` `new_unchecked`, whose caller is presumed
        // to have verified AVX2 support before handing out a `&self`.
        unsafe {
            round_scramble_avx2(acc, secret_end);
        }
    }

    /// Forwards to [`accumulate_avx2`].
    #[inline]
    fn accumulate(&self, acc: &mut [u64; 8], stripe: &[u8; 64], secret: &[u8; 64]) {
        // SAFETY: same argument as in `round_scramble` — holding an `Impl` is
        // taken as proof that AVX2 is available.
        unsafe {
            accumulate_avx2(acc, stripe, secret);
        }
    }
}
/// Scrambles the accumulators by delegating to the scalar implementation,
/// compiled here with AVX2 code generation enabled.
///
/// NOTE(review): presumably this relies on the compiler vectorizing the
/// scalar code once AVX2 is enabled, instead of hand-written intrinsics —
/// confirm against benchmarks.
///
/// # Safety
///
/// The running CPU must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn round_scramble_avx2(acc: &mut [u64; 8], secret_end: &[u8; 64]) {
    let fallback = scalar::Impl;
    fallback.round_scramble(acc, secret_end)
}
/// Folds one 64-byte `stripe` into the eight 64-bit accumulators using the
/// matching 64 bytes of `secret`.
///
/// Reading `stripe` and `secret` as eight little-endian `u64` lanes, the
/// result for every lane `i` is (all additions wrapping):
///
/// ```text
/// v      = stripe[i] ^ secret[i]
/// acc[i] = acc[i] + stripe[i ^ 1] + (v & 0xFFFF_FFFF) * (v >> 32)
/// ```
///
/// The work is done in two passes of four lanes (one 256-bit vector) each.
///
/// # Safety
///
/// The running CPU must support AVX2.
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn accumulate_avx2(acc: &mut [u64; 8], stripe: &[u8; 64], secret: &[u8; 64]) {
    // Each of the three buffers is exactly 64 bytes, i.e. two `__m256i`.
    let acc_vecs = acc.as_mut_ptr().cast::<__m256i>();
    let stripe_vecs = stripe.as_ptr().cast::<__m256i>();
    let secret_vecs = secret.as_ptr().cast::<__m256i>();

    for half in 0..2 {
        // SAFETY: `half` is 0 or 1, so every offset stays inside the 64-byte
        // buffers the pointers were derived from. Only the unaligned
        // load/store intrinsics are used, so no alignment is required. AVX2
        // availability is guaranteed by this function's own safety contract.
        unsafe {
            let acc_slot = acc_vecs.add(half);
            let input = _mm256_loadu_si256(stripe_vecs.add(half));
            let key = _mm256_loadu_si256(secret_vecs.add(half));

            // Swap the two 64-bit halves of each 128-bit lane so that lane
            // `i` receives the raw input of lane `i ^ 1`.
            let neighbour = _mm256_shuffle_epi32::<0b01_00_11_10>(input);
            let with_input = _mm256_add_epi64(_mm256_loadu_si256(acc_slot), neighbour);

            // `_mm256_mul_epu32` multiplies the low 32 bits of each 64-bit
            // lane; shifting a copy right by 32 pairs low half with high half.
            let mixed = _mm256_xor_si256(input, key);
            let product = _mm256_mul_epu32(mixed, _mm256_srli_epi64::<32>(mixed));

            _mm256_storeu_si256(acc_slot, _mm256_add_epi64(with_input, product));
        }
    }
}