use crate::backend::scalar::{
BLOCK_STRIPES, FINAL_STRIPE_SECRET_OFFSET, SCRAMBLE_PRIME, SCRAMBLE_SECRET_OFFSET, STRIPE_BYTES,
STRIPE_SECRET_BASE, init_acc, merge_acc,
};
use crate::constants::SECRET_STREAM;
use core::arch::x86_64::*;
/// Immediate for `_mm256_shuffle_epi32` that selects the 32-bit elements in
/// the order 2, 3, 0, 1 within each 128-bit half, i.e. it exchanges the two
/// neighbouring 64-bit lanes of every pair.
const SWAP_U64_PAIRS: i32 = 0b01_00_11_10;

/// Folds four 64-bit lanes of input into four accumulator lanes.
///
/// For every 64-bit lane `i` the result is, with wrapping arithmetic,
/// `acc[i] + data[i ^ 1] + lo32(k) * hi32(k)` where `k = data[i] ^ secret[i]`.
///
/// # Safety
/// The CPU executing this function must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn mix_quad(acc: __m256i, data: __m256i, secret: __m256i) -> __m256i {
    // Key the input with the secret, then multiply the low 32 bits of each
    // keyed lane by its own high 32 bits (full 64-bit product per lane).
    let keyed = _mm256_xor_si256(data, secret);
    let lo_times_hi = _mm256_mul_epu32(keyed, _mm256_srli_epi64::<32>(keyed));

    // The raw (un-keyed) input is added in from the neighbouring lane.
    let neighbours = _mm256_shuffle_epi32::<SWAP_U64_PAIRS>(data);
    let acc_plus_data = _mm256_add_epi64(acc, neighbours);

    _mm256_add_epi64(acc_plus_data, lo_times_hi)
}
/// Multiplies each of the four 64-bit lanes of `v` by the 32-bit constant
/// `c`, keeping the low 64 bits of every product (wrapping multiply).
///
/// AVX2 only offers a 32x32 -> 64 multiply, so the product is assembled from
/// two partial products: `lo32(v) * c + ((hi32(v) * c) << 32)`.
///
/// # Safety
/// The CPU executing this function must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn mul_m256i_by_u32(v: __m256i, c: u32) -> __m256i {
    // `c` is zero-extended, so the upper 32 bits of every factor lane are 0.
    let factor = _mm256_set1_epi64x(i64::from(c));

    let low_product = _mm256_mul_epu32(v, factor);
    let high_product = _mm256_mul_epu32(_mm256_srli_epi64::<32>(v), factor);

    _mm256_add_epi64(_mm256_slli_epi64::<32>(high_product), low_product)
}
/// Scrambles four 64-bit accumulator lanes.
///
/// Per lane: `(v ^ (v >> 47) ^ s)` multiplied (wrapping) by `SCRAMBLE_PRIME`
/// narrowed to 32 bits.
///
/// # Safety
/// The CPU executing this function must support AVX2.
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn scramble_quad(v: __m256i, s: __m256i) -> __m256i {
    // Fold the top bits of each lane down into its low bits, then key it.
    let folded = _mm256_xor_si256(v, _mm256_srli_epi64::<47>(v));
    let keyed = _mm256_xor_si256(folded, s);

    // SAFETY: the callee only needs AVX2, which this function's own
    // `target_feature` contract already requires of the caller.
    unsafe { mul_m256i_by_u32(keyed, SCRAMBLE_PRIME as u32) }
}
/// Scrambles both accumulator halves in place, using the eight `u64` secret
/// words starting at `secret_ptr` (words 0..4 for `a0`, words 4..8 for `a1`).
///
/// # Safety
/// - The CPU executing this function must support AVX2.
/// - `secret_ptr` must be valid for reading 8 consecutive `u64` values
///   (64 bytes). The loads are unaligned, so no extra alignment is needed.
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn scramble_avx2(a0: &mut __m256i, a1: &mut __m256i, secret_ptr: *const u64) {
    // SAFETY: the caller guarantees 64 readable bytes at `secret_ptr`; one
    // `__m256i` step equals four `u64` words.
    let (key_lo, key_hi) = unsafe {
        let base = secret_ptr.cast::<__m256i>();
        (_mm256_loadu_si256(base), _mm256_loadu_si256(base.add(1)))
    };

    // SAFETY: AVX2 availability is part of this function's own contract.
    unsafe {
        *a0 = scramble_quad(*a0, key_lo);
        *a1 = scramble_quad(*a1, key_hi);
    }
}
/// Mixes one 64-byte stripe of input into both accumulator halves.
///
/// The first 32 input bytes and the first four secret words feed `a0`; the
/// following 32 bytes and the next four secret words feed `a1`.
///
/// # Safety
/// - The CPU executing this function must support AVX2.
/// - `data_ptr` must be valid for reading 64 bytes.
/// - `secret_ptr` must be valid for reading 8 consecutive `u64` values.
///
/// All loads are unaligned, so neither pointer needs extra alignment.
#[target_feature(enable = "avx2")]
#[inline]
unsafe fn mix_stripe_avx2(
    a0: &mut __m256i,
    a1: &mut __m256i,
    data_ptr: *const u8,
    secret_ptr: *const u64,
) {
    // SAFETY: the caller guarantees 64 readable bytes behind each pointer;
    // stepping a `*const __m256i` by one advances exactly 32 bytes.
    let (input_lo, input_hi, key_lo, key_hi) = unsafe {
        let input = data_ptr.cast::<__m256i>();
        let key = secret_ptr.cast::<__m256i>();
        (
            _mm256_loadu_si256(input),
            _mm256_loadu_si256(input.add(1)),
            _mm256_loadu_si256(key),
            _mm256_loadu_si256(key.add(1)),
        )
    };

    // SAFETY: AVX2 availability is part of this function's own contract.
    unsafe {
        *a0 = mix_quad(*a0, input_lo, key_lo);
        *a1 = mix_quad(*a1, input_hi, key_hi);
    }
}
/// AVX2 implementation of the long-input hash: consumes `len` bytes at `ptr`
/// stripe by stripe and reduces the accumulators with `merge_acc`.
///
/// Every full `STRIPE_BYTES` stripe is mixed with a secret window that starts
/// at `STRIPE_SECRET_BASE` and advances by one `u64` per stripe. After each
/// run of `BLOCK_STRIPES` stripes the accumulators are scrambled and the
/// window restarts. Finally the last `STRIPE_BYTES` bytes of the input are
/// mixed once more with the secret at `FINAL_STRIPE_SECRET_OFFSET`; that
/// stripe may overlap (or coincide with) the last full stripe.
///
/// # Safety
/// - The CPU executing this function must support AVX2.
/// - `ptr` must be valid for reading `len` bytes.
/// - `len` must be at least `STRIPE_BYTES`, because the final stripe is read
///   from `ptr + (len - STRIPE_BYTES)`.
/// - NOTE(review): soundness also relies on `SECRET_STREAM` being long enough
///   for every secret offset used here plus eight `u64` words; that length is
///   not visible from this function — confirm against `crate::constants`.
#[target_feature(enable = "avx2")]
pub(crate) unsafe fn hash_bytes_long(ptr: *const u8, len: usize, acc_in: u64) -> u64 {
    let seeded = init_acc(acc_in);
    // SAFETY: two unaligned 32-byte loads from the seed returned by
    // `init_acc` (presumably eight `u64` words — TODO confirm).
    let (mut a0, mut a1) = unsafe {
        (
            _mm256_loadu_si256(seeded.as_ptr().cast::<__m256i>()),
            _mm256_loadu_si256(seeded.as_ptr().add(4).cast::<__m256i>()),
        )
    };

    let secret_base = SECRET_STREAM.as_ptr();
    // SAFETY: pointer arithmetic only; assumes the offset lies inside
    // `SECRET_STREAM`.
    let scramble_secret = unsafe { secret_base.add(SCRAMBLE_SECRET_OFFSET) };

    // Position of the next stripe inside the current block; it doubles as the
    // offset of the secret window relative to `STRIPE_SECRET_BASE`.
    let mut in_block = 0usize;
    for stripe in 0..len / STRIPE_BYTES {
        // SAFETY: `stripe < len / STRIPE_BYTES`, so the whole stripe lies
        // within the `len` readable bytes promised by the caller.
        unsafe {
            mix_stripe_avx2(
                &mut a0,
                &mut a1,
                ptr.add(stripe * STRIPE_BYTES),
                secret_base.add(STRIPE_SECRET_BASE + in_block),
            );
        }

        in_block += 1;
        if in_block == BLOCK_STRIPES {
            // A block is complete: scramble and rewind the secret window.
            // SAFETY: `scramble_secret` was derived from `SECRET_STREAM` above.
            unsafe { scramble_avx2(&mut a0, &mut a1, scramble_secret) };
            in_block = 0;
        }
    }

    // The trailing stripe is always mixed, whether or not it was already
    // covered by the loop above.
    let tail_start = len - STRIPE_BYTES;
    // SAFETY: the caller guarantees `len >= STRIPE_BYTES`, so the stripe
    // `[tail_start, len)` is inside the input.
    unsafe {
        mix_stripe_avx2(
            &mut a0,
            &mut a1,
            ptr.add(tail_start),
            secret_base.add(FINAL_STRIPE_SECRET_OFFSET),
        );
    }

    // Spill both vector halves into a plain array for the final reduction.
    let mut lanes = [0u64; 8];
    // SAFETY: `lanes` holds 64 bytes; each store writes one 32-byte half.
    unsafe {
        _mm256_storeu_si256(lanes.as_mut_ptr().cast::<__m256i>(), a0);
        _mm256_storeu_si256(lanes.as_mut_ptr().add(4).cast::<__m256i>(), a1);
    }
    merge_acc(&lanes, len, acc_in)
}