#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
macro_rules! ror_epi64 {
($value:expr, $left:literal) => {{ _mm256_rol_epi64::<$left>($value) }};
}
#[cfg(target_arch = "x86_64")]
macro_rules! ror_epi64x8 {
($value:expr, $right:literal, $left:literal) => {{ _mm512_or_si512(_mm512_srli_epi64::<$right>($value), _mm512_slli_epi64::<$left>($value)) }};
}
/// AVX-512VL kernel that permutes a single five-word state in place.
///
/// Each state word is broadcast across a 256-bit vector, one round is applied
/// per constant in `super::RC`, and lane 0 of every vector is written back to
/// `state`.
///
/// NOTE(review): the round structure and rotation counts look like the Ascon
/// permutation, and the name suggests `super::RC` holds 12 constants; neither
/// is visible from this file, so confirm against the parent module.
///
/// # Safety
///
/// The executing CPU must support the `avx512f` and `avx512vl` features.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl")]
#[inline]
unsafe fn permute_12_x86_avx512_impl(state: &mut [u64; 5]) {
    let [w0, w1, w2, w3, w4] = *state;
    let mut s0 = _mm256_set1_epi64x(w0 as i64);
    let mut s1 = _mm256_set1_epi64x(w1 as i64);
    let mut s2 = _mm256_set1_epi64x(w2 as i64);
    let mut s3 = _mm256_set1_epi64x(w3 as i64);
    let mut s4 = _mm256_set1_epi64x(w4 as i64);
    let all_ones = _mm256_set1_epi64x(-1);

    for &rc in &super::RC {
        // Phase A: fold the round constant into word 2, then apply the XORs
        // that precede the non-linear step. Words 1 and 3 pass through.
        let a0 = _mm256_xor_si256(s0, s4);
        let a1 = s1;
        let a2 = _mm256_xor_si256(
            _mm256_xor_si256(s2, _mm256_set1_epi64x(rc as i64)),
            s1,
        );
        let a3 = s3;
        let a4 = _mm256_xor_si256(s4, s3);

        // Phase B: truth table 0xD2 evaluates `a ^ (!b & c)` per bit.
        let b0 = _mm256_ternarylogic_epi64(a0, a1, a2, 0xD2);
        let b1 = _mm256_ternarylogic_epi64(a1, a2, a3, 0xD2);
        let b2 = _mm256_ternarylogic_epi64(a2, a3, a4, 0xD2);
        let b3 = _mm256_ternarylogic_epi64(a3, a4, a0, 0xD2);
        let b4 = _mm256_ternarylogic_epi64(a4, a0, a1, 0xD2);

        // Phase C: trailing XORs; word 2 is complemented via XOR with all-ones.
        let c0 = _mm256_xor_si256(b0, b4);
        let c1 = _mm256_xor_si256(b1, b0);
        let c2 = _mm256_xor_si256(b2, all_ones);
        let c3 = _mm256_xor_si256(b3, b2);
        let c4 = b4;

        // Phase D: each word is XORed with two rotated copies of itself.
        // `ror_epi64!` takes left-rotation counts.
        s0 = _mm256_xor_si256(
            c0,
            _mm256_xor_si256(ror_epi64!(c0, 45), ror_epi64!(c0, 36)),
        );
        s1 = _mm256_xor_si256(
            c1,
            _mm256_xor_si256(ror_epi64!(c1, 3), ror_epi64!(c1, 25)),
        );
        s2 = _mm256_xor_si256(
            c2,
            _mm256_xor_si256(ror_epi64!(c2, 63), ror_epi64!(c2, 58)),
        );
        s3 = _mm256_xor_si256(
            c3,
            _mm256_xor_si256(ror_epi64!(c3, 54), ror_epi64!(c3, 47)),
        );
        s4 = _mm256_xor_si256(
            c4,
            _mm256_xor_si256(ror_epi64!(c4, 57), ror_epi64!(c4, 23)),
        );
    }

    // All lanes hold the same value; lane 0 is the result.
    *state = [
        _mm256_extract_epi64::<0>(s0) as u64,
        _mm256_extract_epi64::<0>(s1) as u64,
        _mm256_extract_epi64::<0>(s2) as u64,
        _mm256_extract_epi64::<0>(s3) as u64,
        _mm256_extract_epi64::<0>(s4) as u64,
    ];
}
/// Safe entry point that permutes one five-word state in place by forwarding
/// to `permute_12_x86_avx512_impl`.
///
/// This wrapper performs no CPU feature detection of its own.
#[cfg(target_arch = "x86_64")]
#[inline]
pub(crate) fn permute_12_x86_avx512(state: &mut [u64; 5]) {
// SAFETY: the callee is compiled with `avx512f,avx512vl` enabled, so this call
// is only sound when the running CPU supports both features.
// NOTE(review): no check is visible here; confirm every caller dispatches to
// this function only after detecting those features.
unsafe { permute_12_x86_avx512_impl(state) }
}
#[cfg(target_arch = "x86_64")]
#[cfg_attr(not(any(test, feature = "std")), allow(dead_code))]
#[target_feature(enable = "avx512f,avx512vl")]
#[inline]
unsafe fn permute_12_x86_avx512_x8_impl(states: &mut [[u64; 5]; 8]) {
let mut x0 = _mm512_set_epi64(
states[7][0] as i64,
states[6][0] as i64,
states[5][0] as i64,
states[4][0] as i64,
states[3][0] as i64,
states[2][0] as i64,
states[1][0] as i64,
states[0][0] as i64,
);
let mut x1 = _mm512_set_epi64(
states[7][1] as i64,
states[6][1] as i64,
states[5][1] as i64,
states[4][1] as i64,
states[3][1] as i64,
states[2][1] as i64,
states[1][1] as i64,
states[0][1] as i64,
);
let mut x2 = _mm512_set_epi64(
states[7][2] as i64,
states[6][2] as i64,
states[5][2] as i64,
states[4][2] as i64,
states[3][2] as i64,
states[2][2] as i64,
states[1][2] as i64,
states[0][2] as i64,
);
let mut x3 = _mm512_set_epi64(
states[7][3] as i64,
states[6][3] as i64,
states[5][3] as i64,
states[4][3] as i64,
states[3][3] as i64,
states[2][3] as i64,
states[1][3] as i64,
states[0][3] as i64,
);
let mut x4 = _mm512_set_epi64(
states[7][4] as i64,
states[6][4] as i64,
states[5][4] as i64,
states[4][4] as i64,
states[3][4] as i64,
states[2][4] as i64,
states[1][4] as i64,
states[0][4] as i64,
);
let ones = _mm512_set1_epi64(-1);
for &c in &super::RC {
x2 = _mm512_xor_si512(x2, _mm512_set1_epi64(c as i64));
x0 = _mm512_xor_si512(x0, x4);
x4 = _mm512_xor_si512(x4, x3);
x2 = _mm512_xor_si512(x2, x1);
let y0 = x0;
let y1 = x1;
let y2 = x2;
let y3 = x3;
let y4 = x4;
x0 = _mm512_ternarylogic_epi64(y0, y1, y2, 0xD2);
x1 = _mm512_ternarylogic_epi64(y1, y2, y3, 0xD2);
x2 = _mm512_ternarylogic_epi64(y2, y3, y4, 0xD2);
x3 = _mm512_ternarylogic_epi64(y3, y4, y0, 0xD2);
x4 = _mm512_ternarylogic_epi64(y4, y0, y1, 0xD2);
x1 = _mm512_xor_si512(x1, x0);
x0 = _mm512_xor_si512(x0, x4);
x3 = _mm512_xor_si512(x3, x2);
x2 = _mm512_xor_si512(x2, ones);
let y0 = x0;
let y1 = x1;
let y2 = x2;
let y3 = x3;
let y4 = x4;
x0 = _mm512_xor_si512(y0, _mm512_xor_si512(ror_epi64x8!(y0, 19, 45), ror_epi64x8!(y0, 28, 36)));
x1 = _mm512_xor_si512(y1, _mm512_xor_si512(ror_epi64x8!(y1, 61, 3), ror_epi64x8!(y1, 39, 25)));
x2 = _mm512_xor_si512(y2, _mm512_xor_si512(ror_epi64x8!(y2, 1, 63), ror_epi64x8!(y2, 6, 58)));
x3 = _mm512_xor_si512(y3, _mm512_xor_si512(ror_epi64x8!(y3, 10, 54), ror_epi64x8!(y3, 17, 47)));
x4 = _mm512_xor_si512(y4, _mm512_xor_si512(ror_epi64x8!(y4, 7, 57), ror_epi64x8!(y4, 41, 23)));
}
let mut lanes0 = [0u64; 8];
let mut lanes1 = [0u64; 8];
let mut lanes2 = [0u64; 8];
let mut lanes3 = [0u64; 8];
let mut lanes4 = [0u64; 8];
unsafe {
_mm512_storeu_si512(lanes0.as_mut_ptr().cast(), x0);
_mm512_storeu_si512(lanes1.as_mut_ptr().cast(), x1);
_mm512_storeu_si512(lanes2.as_mut_ptr().cast(), x2);
_mm512_storeu_si512(lanes3.as_mut_ptr().cast(), x3);
_mm512_storeu_si512(lanes4.as_mut_ptr().cast(), x4);
}
for lane in 0..8 {
states[lane][0] = lanes0[lane];
states[lane][1] = lanes1[lane];
states[lane][2] = lanes2[lane];
states[lane][3] = lanes3[lane];
states[lane][4] = lanes4[lane];
}
}
/// Safe entry point that permutes eight five-word states in place by
/// forwarding to `permute_12_x86_avx512_x8_impl`.
///
/// This wrapper performs no CPU feature detection of its own.
#[cfg(target_arch = "x86_64")]
// Permitted to be unused when neither `test` nor the `std` feature is active.
#[cfg_attr(not(any(test, feature = "std")), allow(dead_code))]
#[inline]
pub(crate) fn permute_12_x86_avx512_x8(states: &mut [[u64; 5]; 8]) {
// SAFETY: the callee is compiled with `avx512f,avx512vl` enabled, so this call
// is only sound when the running CPU supports both features.
// NOTE(review): no check is visible here; confirm every caller dispatches to
// this function only after detecting those features.
unsafe { permute_12_x86_avx512_x8_impl(states) }
}