#![allow(clippy::cast_possible_truncation, clippy::indexing_slicing)]
use core::arch::x86_64::*;
use super::kernels::{SIGMA, init_v, load_msg};
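/// Rotate each 64-bit lane right by 32 bits. The immediate `0xB1` selects
/// 32-bit elements `[1, 0, 3, 2]` within each 128-bit lane, swapping the two
/// halves of every 64-bit word.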
#[inline(always)]
unsafe fn ror32_avx2(x: __m256i) -> __m256i {
    unsafe { _mm256_shuffle_epi32(x, 0xB1) }
}
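/// Rotate each 64-bit lane right by 24 bits via a byte shuffle.
/// `_mm256_shuffle_epi8` indexes bytes within each 128-bit lane, so the
/// 16-byte rotation pattern is repeated for both halves of the mask.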
#[inline(always)]
unsafe fn ror24_avx2(x: __m256i) -> __m256i {
    #[repr(align(32))]
    struct Align32([u8; 32]);
    static ROT24: Align32 = Align32([
        3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
        3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
    ]);
    unsafe {
        let mask = _mm256_load_si256(ROT24.0.as_ptr().cast());
        _mm256_shuffle_epi8(x, mask)
    }
}
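/// Rotate each 64-bit lane right by 16 bits, using the same byte-shuffle
/// technique as `ror24_avx2`.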
#[inline(always)]
unsafe fn ror16_avx2(x: __m256i) -> __m256i {
    #[repr(align(32))]
    struct Align32([u8; 32]);
    static ROT16: Align32 = Align32([
        2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
        2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
    ]);
    unsafe {
        let mask = _mm256_load_si256(ROT16.0.as_ptr().cast());
        _mm256_shuffle_epi8(x, mask)
    }
}
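/// Rotate each 64-bit lane right by 63 bits, i.e. left by 1:
/// `(x >> 63) ^ (x << 1)`. The add computes the left shift, and XOR is
/// equivalent to OR here because the two operands share no set bits.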
#[inline(always)]
unsafe fn ror63_avx2(x: __m256i) -> __m256i {
    unsafe { _mm256_xor_si256(_mm256_srli_epi64(x, 63), _mm256_add_epi64(x, x)) }
}
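/// The BLAKE2b G function, four lanes wide: `a`, `b`, `c`, `d` each hold one
/// row of the 4x4 state, so a single call applies G to all four columns (or,
/// after `diagonalize`, to all four diagonals). `mx` and `my` carry the
/// scheduled message words for the two halves of G.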
#[inline(always)]
unsafe fn g_avx2(
    a: &mut __m256i,
    b: &mut __m256i,
    c: &mut __m256i,
    d: &mut __m256i,
    mx: __m256i,
    my: __m256i,
) {
    unsafe {
        *a = _mm256_add_epi64(_mm256_add_epi64(*a, *b), mx);
        *d = ror32_avx2(_mm256_xor_si256(*d, *a));
        *c = _mm256_add_epi64(*c, *d);
        *b = ror24_avx2(_mm256_xor_si256(*b, *c));
        *a = _mm256_add_epi64(_mm256_add_epi64(*a, *b), my);
        *d = ror16_avx2(_mm256_xor_si256(*d, *a));
        *c = _mm256_add_epi64(*c, *d);
        *b = ror63_avx2(_mm256_xor_si256(*b, *c));
    }
}
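/// Rotate the lanes of rows `b`, `c`, `d` by one, two, and three positions
/// respectively, so the diagonals of the 4x4 state land in column position
/// for the second G pass of each round.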
#[inline(always)]
unsafe fn diagonalize(b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
    unsafe {
        *b = _mm256_permute4x64_epi64(*b, 0x39);
        *c = _mm256_permute4x64_epi64(*c, 0x4E);
        *d = _mm256_permute4x64_epi64(*d, 0x93);
    }
}
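/// Inverse of `diagonalize`: rotate the lanes back so the state is in row
/// order again.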
#[inline(always)]
unsafe fn undiagonalize(b: &mut __m256i, c: &mut __m256i, d: &mut __m256i) {
    unsafe {
        *b = _mm256_permute4x64_epi64(*b, 0x93);
        *c = _mm256_permute4x64_epi64(*c, 0x4E);
        *d = _mm256_permute4x64_epi64(*d, 0x39);
    }
}
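/// BLAKE2b compression of one 128-byte block using AVX2, processing the four
/// columns (or diagonals) of the state in parallel.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2, e.g. via runtime feature
/// detection, before calling this.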
#[target_feature(enable = "avx2")]
pub(super) unsafe fn compress_avx2(h: &mut [u64; 8], block: &[u8; 128], t: u128, last: bool) {
    unsafe {
        let m = load_msg(block);
        let v = init_v(h, t, last);
        let mut a = _mm256_loadu_si256(v.as_ptr().cast());
        let mut b = _mm256_loadu_si256(v.as_ptr().add(4).cast());
        let mut c = _mm256_loadu_si256(v.as_ptr().add(8).cast());
        let mut d = _mm256_loadu_si256(v.as_ptr().add(12).cast());
        for round in 0..12u8 {
            // BLAKE2b runs 12 rounds; the sigma schedule repeats after 10.
            let s = &SIGMA[(round % 10) as usize];
            // Column step. `_mm256_set_epi64x` takes its arguments highest
            // lane first, so lane 0 receives m[s[0]], lane 1 m[s[2]], etc.
            let mx = _mm256_set_epi64x(
                m[s[6] as usize] as i64,
                m[s[4] as usize] as i64,
                m[s[2] as usize] as i64,
                m[s[0] as usize] as i64,
            );
            let my = _mm256_set_epi64x(
                m[s[7] as usize] as i64,
                m[s[5] as usize] as i64,
                m[s[3] as usize] as i64,
                m[s[1] as usize] as i64,
            );
            g_avx2(&mut a, &mut b, &mut c, &mut d, mx, my);
            // Diagonal step.
            diagonalize(&mut b, &mut c, &mut d);
            let mx = _mm256_set_epi64x(
                m[s[14] as usize] as i64,
                m[s[12] as usize] as i64,
                m[s[10] as usize] as i64,
                m[s[8] as usize] as i64,
            );
            let my = _mm256_set_epi64x(
                m[s[15] as usize] as i64,
                m[s[13] as usize] as i64,
                m[s[11] as usize] as i64,
                m[s[9] as usize] as i64,
            );
            g_avx2(&mut a, &mut b, &mut c, &mut d, mx, my);
            undiagonalize(&mut b, &mut c, &mut d);
        }
        // Feed-forward: h[i] ^= v[i] ^ v[i + 8].
        let h0 = _mm256_loadu_si256(h.as_ptr().cast());
        let h1 = _mm256_loadu_si256(h.as_ptr().add(4).cast());
        _mm256_storeu_si256(h.as_mut_ptr().cast(), _mm256_xor_si256(h0, _mm256_xor_si256(a, c)));
        _mm256_storeu_si256(
            h.as_mut_ptr().add(4).cast(),
            _mm256_xor_si256(h1, _mm256_xor_si256(b, d)),
        );
    }
}
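/// Same G function as `g_avx2`, but each rotation is a single
/// `_mm256_ror_epi64` instruction, available on 256-bit vectors with
/// AVX-512VL.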
#[inline(always)]
unsafe fn g_avx512vl(
    a: &mut __m256i,
    b: &mut __m256i,
    c: &mut __m256i,
    d: &mut __m256i,
    mx: __m256i,
    my: __m256i,
) {
    unsafe {
        *a = _mm256_add_epi64(_mm256_add_epi64(*a, *b), mx);
        *d = _mm256_ror_epi64(_mm256_xor_si256(*d, *a), 32);
        *c = _mm256_add_epi64(*c, *d);
        *b = _mm256_ror_epi64(_mm256_xor_si256(*b, *c), 24);
        *a = _mm256_add_epi64(_mm256_add_epi64(*a, *b), my);
        *d = _mm256_ror_epi64(_mm256_xor_si256(*d, *a), 16);
        *c = _mm256_add_epi64(*c, *d);
        *b = _mm256_ror_epi64(_mm256_xor_si256(*b, *c), 63);
    }
}
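/// BLAKE2b compression of one 128-byte block using 256-bit AVX-512VL rotates.
///
/// # Safety
/// The caller must ensure the CPU supports AVX512F and AVX512VL before
/// calling this.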
#[target_feature(enable = "avx512f,avx512vl")]
pub(super) unsafe fn compress_avx512vl(h: &mut [u64; 8], block: &[u8; 128], t: u128, last: bool) {
    unsafe {
        let m = load_msg(block);
        let v = init_v(h, t, last);
        let mut a = _mm256_loadu_si256(v.as_ptr().cast());
        let mut b = _mm256_loadu_si256(v.as_ptr().add(4).cast());
        let mut c = _mm256_loadu_si256(v.as_ptr().add(8).cast());
        let mut d = _mm256_loadu_si256(v.as_ptr().add(12).cast());
        for round in 0..12u8 {
            let s = &SIGMA[(round % 10) as usize];
            // Column step; argument order is highest lane first, as above.
            let mx = _mm256_set_epi64x(
                m[s[6] as usize] as i64,
                m[s[4] as usize] as i64,
                m[s[2] as usize] as i64,
                m[s[0] as usize] as i64,
            );
            let my = _mm256_set_epi64x(
                m[s[7] as usize] as i64,
                m[s[5] as usize] as i64,
                m[s[3] as usize] as i64,
                m[s[1] as usize] as i64,
            );
            g_avx512vl(&mut a, &mut b, &mut c, &mut d, mx, my);
            // Diagonal step.
            diagonalize(&mut b, &mut c, &mut d);
            let mx = _mm256_set_epi64x(
                m[s[14] as usize] as i64,
                m[s[12] as usize] as i64,
                m[s[10] as usize] as i64,
                m[s[8] as usize] as i64,
            );
            let my = _mm256_set_epi64x(
                m[s[15] as usize] as i64,
                m[s[13] as usize] as i64,
                m[s[11] as usize] as i64,
                m[s[9] as usize] as i64,
            );
            g_avx512vl(&mut a, &mut b, &mut c, &mut d, mx, my);
            undiagonalize(&mut b, &mut c, &mut d);
        }
        // Feed-forward: h[i] ^= v[i] ^ v[i + 8].
        let h0 = _mm256_loadu_si256(h.as_ptr().cast());
        let h1 = _mm256_loadu_si256(h.as_ptr().add(4).cast());
        _mm256_storeu_si256(h.as_mut_ptr().cast(), _mm256_xor_si256(h0, _mm256_xor_si256(a, c)));
        _mm256_storeu_si256(
            h.as_mut_ptr().add(4).cast(),
            _mm256_xor_si256(h1, _mm256_xor_si256(b, d)),
        );
    }
}
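
// A minimal dispatch sketch (illustrative only, not part of this module):
// callers are expected to probe CPU features before invoking either kernel.
// With `std` available that could look like the following, where
// `compress_portable` is a hypothetical scalar fallback; in a `no_std` build
// the crate would need its own cpuid probe instead of
// `is_x86_feature_detected!`.
//
//     pub fn compress(h: &mut [u64; 8], block: &[u8; 128], t: u128, last: bool) {
//         if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
//             unsafe { compress_avx512vl(h, block, t, last) }
//         } else if is_x86_feature_detected!("avx2") {
//             unsafe { compress_avx2(h, block, t, last) }
//         } else {
//             compress_portable(h, block, t, last)
//         }
//     }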