#![allow(unsafe_code)]
#![allow(clippy::inline_always)]
#![allow(clippy::too_many_arguments)]
#![allow(clippy::many_single_char_names)]
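//! x86_64 SIMD kernels for the BLAKE3 compression function: SSE4.1, AVX2,
//! and AVX-512 intrinsics paths, plus assembly backends in the `asm` module
//! on the operating systems that build them.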
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
pub(crate) mod asm;
pub(crate) mod avx2;
pub(crate) mod avx512;
pub(crate) mod sse41;
use super::{BLOCK_LEN, CHUNK_START, IV, PARENT};
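// The compression function takes the 64-bit block counter as two 32-bit
// state words (words 12 and 13 of the initial state); these helpers split it.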
#[inline(always)]
pub(crate) const fn counter_low(counter: u64) -> u32 {
counter as u32
}
#[inline(always)]
pub(crate) const fn counter_high(counter: u64) -> u32 {
(counter >> 32) as u32
}
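// Load one 64-byte block as four unaligned 128-bit message vectors.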
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn load_msg_vecs(block: *const u8) -> (__m128i, __m128i, __m128i, __m128i) {
unsafe {
let m0 = _mm_loadu_si128(block.cast());
let m1 = _mm_loadu_si128(block.add(16).cast());
let m2 = _mm_loadu_si128(block.add(32).cast());
let m3 = _mm_loadu_si128(block.add(48).cast());
(m0, m1, m2, m3)
}
}
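// Compress a single block and write the new chaining value back in place.
// Callers must ensure `block` points to at least 64 readable bytes and that
// SSE4.1/SSSE3 are available on the running CPU.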
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1,ssse3")]
pub(crate) unsafe fn compress_in_place_sse41_bytes(
chaining_value: &mut [u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) {
unsafe {
let (m0, m1, m2, m3) = load_msg_vecs(block);
let [row0, row1, row2, row3] = compress_pre_sse41_impl(chaining_value, m0, m1, m2, m3, counter, block_len, flags);
_mm_storeu_si128(chaining_value.as_mut_ptr().cast(), _mm_xor_si128(row0, row2));
_mm_storeu_si128(chaining_value.as_mut_ptr().add(4).cast(), _mm_xor_si128(row1, row3));
}
}
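// As above, but returns the new chaining value instead of mutating it.
// Same safety requirements as `compress_in_place_sse41_bytes`.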
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1,ssse3")]
pub(crate) unsafe fn compress_cv_sse41_bytes(
chaining_value: &[u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
unsafe {
let (m0, m1, m2, m3) = load_msg_vecs(block);
let [row0, row1, row2, row3] = compress_pre_sse41_impl(chaining_value, m0, m1, m2, m3, counter, block_len, flags);
let row0 = _mm_xor_si128(row0, row2);
let row1 = _mm_xor_si128(row1, row3);
let mut out = [0u32; 8];
_mm_storeu_si128(out.as_mut_ptr().cast(), row0);
_mm_storeu_si128(out.as_mut_ptr().add(4).cast(), row1);
out
}
}
#[cfg(target_arch = "x86_64")]
#[cfg_attr(
any(target_os = "linux", target_os = "macos", target_os = "windows"),
allow(dead_code)
)]
#[target_feature(enable = "avx2,sse4.1,ssse3")]
pub(crate) unsafe fn compress_cv_avx2_bytes(
chaining_value: &[u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
unsafe {
let (m0, m1, m2, m3) = load_msg_vecs(block);
compress_cv_avx2(chaining_value, m0, m1, m2, m3, counter, block_len, flags)
}
}
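// The AVX-512 "_bytes" kernels below reuse the same SSE4.1 intrinsic body.
// With avx512f/avx512vl enabled via #[target_feature], the compiler is free
// to lower the shift-and-or rotations to single vprold instructions.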
#[cfg(target_arch = "x86_64")]
#[cfg_attr(
any(target_os = "linux", target_os = "macos", target_os = "windows"),
allow(dead_code)
)]
#[target_feature(enable = "avx512f,avx512vl,avx2,sse4.1,ssse3")]
pub(crate) unsafe fn compress_in_place_avx512_bytes(
chaining_value: &mut [u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) {
unsafe {
let (m0, m1, m2, m3) = load_msg_vecs(block);
let [row0, row1, row2, row3] = compress_pre_sse41_impl(chaining_value, m0, m1, m2, m3, counter, block_len, flags);
_mm_storeu_si128(chaining_value.as_mut_ptr().cast(), _mm_xor_si128(row0, row2));
_mm_storeu_si128(chaining_value.as_mut_ptr().add(4).cast(), _mm_xor_si128(row1, row3));
}
}
#[cfg(target_arch = "x86_64")]
#[cfg_attr(
any(target_os = "linux", target_os = "macos", target_os = "windows"),
allow(dead_code)
)]
#[target_feature(enable = "avx512f,avx512vl,avx2,sse4.1,ssse3")]
pub(crate) unsafe fn compress_cv_avx512_bytes(
chaining_value: &[u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
unsafe {
let (m0, m1, m2, m3) = load_msg_vecs(block);
compress_cv_avx512(chaining_value, m0, m1, m2, m3, counter, block_len, flags)
}
}
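// 32-bit right-rotations by the four BLAKE3 constants (16, 12, 8, 7),
// expressed as shift-and-or so they work on plain SSE registers.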
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn rot16_sse41(a: __m128i) -> __m128i {
unsafe { _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 16)) }
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn rot12_sse41(a: __m128i) -> __m128i {
unsafe { _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 20)) }
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn rot8_sse41(a: __m128i) -> __m128i {
unsafe { _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 24)) }
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn rot7_sse41(a: __m128i) -> __m128i {
unsafe { _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 25)) }
}
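// g1/g2 are the two halves of the BLAKE3 G function, applied to all four
// columns (or diagonals) at once: g1 ends with rot16/rot12, g2 with rot8/rot7.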
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn g1_sse41(row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i) {
unsafe {
*row0 = _mm_add_epi32(_mm_add_epi32(*row0, m), *row1);
*row3 = _mm_xor_si128(*row3, *row0);
*row3 = rot16_sse41(*row3);
*row2 = _mm_add_epi32(*row2, *row3);
*row1 = _mm_xor_si128(*row1, *row2);
*row1 = rot12_sse41(*row1);
}
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn g2_sse41(row0: &mut __m128i, row1: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i, m: __m128i) {
unsafe {
*row0 = _mm_add_epi32(_mm_add_epi32(*row0, m), *row1);
*row3 = _mm_xor_si128(*row3, *row0);
*row3 = rot8_sse41(*row3);
*row2 = _mm_add_epi32(*row2, *row3);
*row1 = _mm_xor_si128(*row1, *row2);
*row1 = rot7_sse41(*row1);
}
}
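// `_MM_SHUFFLE` mirrors the C macro of the same name; `shuffle2` picks two
// lanes from each of two vectors by round-tripping through _mm_shuffle_ps.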
macro_rules! _MM_SHUFFLE {
($z:expr, $y:expr, $x:expr, $w:expr) => {
($z << 6) | ($y << 4) | ($x << 2) | $w
};
}
macro_rules! shuffle2 {
($a:expr, $b:expr, $c:expr) => {
_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps($a), _mm_castsi128_ps($b), $c))
};
}
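// Rotate rows 0, 2, and 3 so the G function operates on diagonals instead
// of columns; row 1 stays fixed. `undiagonalize_sse41` applies the inverse.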
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn diagonalize_sse41(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
unsafe {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1));
}
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn undiagonalize_sse41(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) {
unsafe {
*row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1));
*row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2));
*row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3));
}
}
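// Run all seven BLAKE3 rounds and return the four state rows before the
// final feed-forward XOR. The first round reads the message in natural
// order; later rounds apply the message permutation on the fly with
// shuffles and blends.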
#[cfg(target_arch = "x86_64")]
#[inline(always)]
unsafe fn compress_pre_sse41_impl(
chaining_value: &[u32; 8],
mut m0: __m128i,
mut m1: __m128i,
mut m2: __m128i,
mut m3: __m128i,
counter: u64,
block_len: u32,
flags: u32,
) -> [__m128i; 4] {
unsafe {
let mut row0 = _mm_loadu_si128(chaining_value.as_ptr().cast());
let mut row1 = _mm_loadu_si128(chaining_value.as_ptr().add(4).cast());
let mut row2 = _mm_setr_epi32(
IV[0].cast_signed(),
IV[1].cast_signed(),
IV[2].cast_signed(),
IV[3].cast_signed(),
);
let mut row3 = _mm_setr_epi32(
(counter as u32).cast_signed(),
((counter >> 32) as u32).cast_signed(),
block_len.cast_signed(),
flags.cast_signed(),
);
let mut t0;
let mut t1;
let mut t2;
let mut t3;
let mut tt;
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0));
g1_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t0);
t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1));
g2_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t1);
diagonalize_sse41(&mut row0, &mut row2, &mut row3);
t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0));
t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3));
g1_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t2);
t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1));
t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3));
g2_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t3);
undiagonalize_sse41(&mut row0, &mut row2, &mut row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
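// One full round: permute the message words into t0..t3, run G over the
// columns, diagonalize, run G over the diagonals, and undiagonalize. The
// `update` variant also keeps the permuted words for the next round; the
// `final` variant skips that bookkeeping on round seven.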
macro_rules! next_round_update {
() => {{
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t1);
diagonalize_sse41(&mut row0, &mut row2, &mut row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t3);
undiagonalize_sse41(&mut row0, &mut row2, &mut row3);
m0 = t0;
m1 = t1;
m2 = t2;
m3 = t3;
}};
}
macro_rules! next_round_final {
() => {{
t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2));
t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1));
g1_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t0);
t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2));
tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3));
t1 = _mm_blend_epi16(tt, t1, 0xCC);
g2_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t1);
diagonalize_sse41(&mut row0, &mut row2, &mut row3);
t2 = _mm_unpacklo_epi64(m3, m1);
tt = _mm_blend_epi16(t2, m2, 0xC0);
t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0));
g1_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t2);
t3 = _mm_unpackhi_epi32(m1, m3);
tt = _mm_unpacklo_epi32(m2, t3);
t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2));
g2_sse41(&mut row0, &mut row1, &mut row2, &mut row3, t3);
undiagonalize_sse41(&mut row0, &mut row2, &mut row3);
}};
}
next_round_update!();
next_round_update!();
next_round_update!();
next_round_update!();
next_round_update!();
next_round_final!();
[row0, row1, row2, row3]
}
}
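// Feed-forward: XOR the upper rows into the lower rows to produce the new
// eight-word chaining value. The AVX2 and AVX-512 variants below share this
// body; only the #[target_feature] sets differ.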
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,sse4.1,ssse3")]
unsafe fn compress_cv_avx2(
chaining_value: &[u32; 8],
m0: __m128i,
m1: __m128i,
m2: __m128i,
m3: __m128i,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
unsafe {
let [row0, row1, row2, row3] = compress_pre_sse41_impl(chaining_value, m0, m1, m2, m3, counter, block_len, flags);
let row0 = _mm_xor_si128(row0, row2);
let row1 = _mm_xor_si128(row1, row3);
let mut out = [0u32; 8];
_mm_storeu_si128(out.as_mut_ptr().cast(), row0);
_mm_storeu_si128(out.as_mut_ptr().add(4).cast(), row1);
out
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx2,sse4.1,ssse3")]
unsafe fn compress_cv_avx512(
chaining_value: &[u32; 8],
m0: __m128i,
m1: __m128i,
m2: __m128i,
m3: __m128i,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
unsafe {
let [row0, row1, row2, row3] = compress_pre_sse41_impl(chaining_value, m0, m1, m2, m3, counter, block_len, flags);
let row0 = _mm_xor_si128(row0, row2);
let row1 = _mm_xor_si128(row1, row3);
let mut out = [0u32; 8];
_mm_storeu_si128(out.as_mut_ptr().cast(), row0);
_mm_storeu_si128(out.as_mut_ptr().add(4).cast(), row1);
out
}
}
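// Full 16-word compression output, used for root/extended output: words
// 0..8 are the new chaining value, and words 8..16 additionally XOR the
// upper rows with the input chaining value.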
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,sse4.1,ssse3")]
pub(crate) unsafe fn compress_avx2(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe {
let m0 = _mm_loadu_si128(block_words.as_ptr().cast());
let m1 = _mm_loadu_si128(block_words.as_ptr().add(4).cast());
let m2 = _mm_loadu_si128(block_words.as_ptr().add(8).cast());
let m3 = _mm_loadu_si128(block_words.as_ptr().add(12).cast());
let [mut row0, mut row1, mut row2, mut row3] =
compress_pre_sse41_impl(chaining_value, m0, m1, m2, m3, counter, block_len, flags);
let cv_lo = _mm_loadu_si128(chaining_value.as_ptr().cast());
let cv_hi = _mm_loadu_si128(chaining_value.as_ptr().add(4).cast());
row0 = _mm_xor_si128(row0, row2);
row1 = _mm_xor_si128(row1, row3);
row2 = _mm_xor_si128(row2, cv_lo);
row3 = _mm_xor_si128(row3, cv_hi);
let mut out = [0u32; 16];
_mm_storeu_si128(out.as_mut_ptr().cast(), row0);
_mm_storeu_si128(out.as_mut_ptr().add(4).cast(), row1);
_mm_storeu_si128(out.as_mut_ptr().add(8).cast(), row2);
_mm_storeu_si128(out.as_mut_ptr().add(12).cast(), row3);
out
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1,ssse3")]
pub(crate) unsafe fn compress_sse41(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe {
let m0 = _mm_loadu_si128(block_words.as_ptr().cast());
let m1 = _mm_loadu_si128(block_words.as_ptr().add(4).cast());
let m2 = _mm_loadu_si128(block_words.as_ptr().add(8).cast());
let m3 = _mm_loadu_si128(block_words.as_ptr().add(12).cast());
let [mut row0, mut row1, mut row2, mut row3] =
compress_pre_sse41_impl(chaining_value, m0, m1, m2, m3, counter, block_len, flags);
let cv_lo = _mm_loadu_si128(chaining_value.as_ptr().cast());
let cv_hi = _mm_loadu_si128(chaining_value.as_ptr().add(4).cast());
row0 = _mm_xor_si128(row0, row2);
row1 = _mm_xor_si128(row1, row3);
row2 = _mm_xor_si128(row2, cv_lo);
row3 = _mm_xor_si128(row3, cv_hi);
let mut out = [0u32; 16];
_mm_storeu_si128(out.as_mut_ptr().cast(), row0);
_mm_storeu_si128(out.as_mut_ptr().add(4).cast(), row1);
_mm_storeu_si128(out.as_mut_ptr().add(8).cast(), row2);
_mm_storeu_si128(out.as_mut_ptr().add(12).cast(), row3);
out
}
}
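// Compress a run of whole blocks within one chunk. CHUNK_START is set only
// while no block has been compressed yet; on the OSes that build the asm
// kernels the loop dispatches there, otherwise to the intrinsics above. The
// single-block case is special-cased to skip the chunking setup.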
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1,ssse3")]
pub(crate) unsafe fn chunk_compress_blocks_sse41(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe {
debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);
if blocks.len() == BLOCK_LEN {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
asm::compress_in_place_sse41_mut(
chaining_value,
blocks.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
compress_in_place_sse41_bytes(
chaining_value,
blocks.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
*blocks_compressed = blocks_compressed.wrapping_add(1);
return;
}
let (block_slices, remainder) = blocks.as_chunks::<BLOCK_LEN>();
debug_assert!(remainder.is_empty());
for block_bytes in block_slices {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
asm::compress_in_place_sse41_mut(
chaining_value,
block_bytes.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
compress_in_place_sse41_bytes(
chaining_value,
block_bytes.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
*blocks_compressed = blocks_compressed.wrapping_add(1);
}
}
}
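// A parent node hashes the concatenated child CVs as its message block,
// with counter 0, a full block length, and the PARENT flag set. The AVX2
// and AVX-512 analogues below follow the same pattern.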
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sse4.1,ssse3")]
pub(crate) unsafe fn parent_cv_sse41(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe {
let m0 = _mm_loadu_si128(left_child_cv.as_ptr().cast());
let m1 = _mm_loadu_si128(left_child_cv.as_ptr().add(4).cast());
let m2 = _mm_loadu_si128(right_child_cv.as_ptr().cast());
let m3 = _mm_loadu_si128(right_child_cv.as_ptr().add(4).cast());
let [row0, row1, row2, row3] =
compress_pre_sse41_impl(&key_words, m0, m1, m2, m3, 0, BLOCK_LEN as u32, PARENT | flags);
let row0 = _mm_xor_si128(row0, row2);
let row1 = _mm_xor_si128(row1, row3);
let mut out = [0u32; 8];
_mm_storeu_si128(out.as_mut_ptr().cast(), row0);
_mm_storeu_si128(out.as_mut_ptr().add(4).cast(), row1);
out
}
}
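// AVX2 analogue of `chunk_compress_blocks_sse41` above; the AVX-512 version
// further below follows the same dispatch structure.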
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,sse4.1,ssse3")]
pub(crate) unsafe fn chunk_compress_blocks_avx2(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe {
debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);
if blocks.len() == BLOCK_LEN {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
asm::compress_in_place_avx2_mut(
chaining_value,
blocks.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
*chaining_value = compress_cv_avx2_bytes(
chaining_value,
blocks.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
*blocks_compressed = blocks_compressed.wrapping_add(1);
return;
}
let (block_slices, remainder) = blocks.as_chunks::<BLOCK_LEN>();
debug_assert!(remainder.is_empty());
for block_bytes in block_slices {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
asm::compress_in_place_avx2_mut(
chaining_value,
block_bytes.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
*chaining_value = compress_cv_avx2_bytes(
chaining_value,
block_bytes.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
*blocks_compressed = blocks_compressed.wrapping_add(1);
}
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,sse4.1,ssse3")]
pub(crate) unsafe fn parent_cv_avx2(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe {
let m0 = _mm_loadu_si128(left_child_cv.as_ptr().cast());
let m1 = _mm_loadu_si128(left_child_cv.as_ptr().add(4).cast());
let m2 = _mm_loadu_si128(right_child_cv.as_ptr().cast());
let m3 = _mm_loadu_si128(right_child_cv.as_ptr().add(4).cast());
compress_cv_avx2(&key_words, m0, m1, m2, m3, 0, BLOCK_LEN as u32, PARENT | flags)
}
}
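// The full-output AVX-512 kernel has a dedicated implementation in the
// `avx512` module rather than reusing the SSE4.1 round function.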
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx2,sse4.1,ssse3")]
pub(crate) unsafe fn compress_avx512(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { avx512::compress_block(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx2,sse4.1,ssse3")]
pub(crate) unsafe fn chunk_compress_blocks_avx512(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe {
debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);
if blocks.len() == BLOCK_LEN {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
*chaining_value = asm::compress_in_place_avx512(
chaining_value,
blocks.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
*chaining_value = compress_cv_avx512_bytes(
chaining_value,
blocks.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
*blocks_compressed = blocks_compressed.wrapping_add(1);
return;
}
let (block_slices, remainder) = blocks.as_chunks::<BLOCK_LEN>();
debug_assert!(remainder.is_empty());
for block_bytes in block_slices {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
*chaining_value = asm::compress_in_place_avx512(
chaining_value,
block_bytes.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
*chaining_value = compress_cv_avx512_bytes(
chaining_value,
block_bytes.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags | start,
);
}
*blocks_compressed = blocks_compressed.wrapping_add(1);
}
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx512f,avx512vl,avx2,sse4.1,ssse3")]
pub(crate) unsafe fn parent_cv_avx512(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe {
let m0 = _mm_loadu_si128(left_child_cv.as_ptr().cast());
let m1 = _mm_loadu_si128(left_child_cv.as_ptr().add(4).cast());
let m2 = _mm_loadu_si128(right_child_cv.as_ptr().cast());
let m3 = _mm_loadu_si128(right_child_cv.as_ptr().add(4).cast());
compress_cv_avx512(&key_words, m0, m1, m2, m3, 0, BLOCK_LEN as u32, PARENT | flags)
}
}
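// A minimal sanity sketch, not part of the original source. The counter
// round-trip is pure arithmetic. The SIMD check is compiled only when
// sse4.1 and avx2 are enabled at build time (e.g. RUSTFLAGS="-C
// target-feature=+avx2,+sse4.1") and asserts that two single-block kernels
// agree on an arbitrary input. It assumes the crate's tests build with the
// standard test harness and that `IV` is the eight-word BLAKE3 IV exported
// by the parent module.
#[cfg(test)]
mod tests {
use super::*;

#[test]
fn counter_words_round_trip() {
let counter = 0x1122_3344_5566_7788u64;
assert_eq!(counter_low(counter), 0x5566_7788);
assert_eq!(counter_high(counter), 0x1122_3344);
assert_eq!(
(u64::from(counter_high(counter)) << 32) | u64::from(counter_low(counter)),
counter
);
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse4.1", target_feature = "avx2"))]
#[test]
fn sse41_and_avx2_block_kernels_agree() {
let mut block_words = [0u32; 16];
for (i, w) in block_words.iter_mut().enumerate() {
*w = (i as u32).wrapping_mul(0x9E37_79B9);
}
// Both kernels implement the same compression function, so their full
// 16-word outputs must match for any input. Safe to execute because the
// enclosing cfg guarantees the required target features are compiled in.
let a = unsafe { compress_sse41(&IV, &block_words, 1, BLOCK_LEN as u32, CHUNK_START) };
let b = unsafe { compress_avx2(&IV, &block_words, 1, BLOCK_LEN as u32, CHUNK_START) };
assert_eq!(a, b);
}
}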