use super::{
BLOCK_LEN, CHUNK_LEN, CHUNK_START, OUT_LEN, PARENT, first_8_words, words8_from_le_bytes_32, words16_from_le_bytes_64,
};
use crate::platform::Caps;
#[cfg(target_arch = "aarch64")]
use crate::platform::caps::aarch64;
#[cfg(target_arch = "powerpc64")]
use crate::platform::caps::power;
#[cfg(target_arch = "riscv64")]
use crate::platform::caps::riscv;
#[cfg(target_arch = "s390x")]
use crate::platform::caps::s390x;
#[cfg(target_arch = "x86_64")]
use crate::platform::caps::x86;
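// Function-pointer signatures for the per-kernel entry points. `CompressFn`
// is one full compression returning all 16 output words;
// `ChunkCompressBlocksFn` folds whole 64-byte blocks into a chunk's chaining
// value, bumping `blocks_compressed` so the CHUNK_START flag is applied only
// to a chunk's first block; `HashManyContiguousFn` hashes `num_chunks` full
// chunks laid out back to back.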
pub(crate) type CompressFn = fn(&[u32; 8], &[u32; 16], u64, u32, u32) -> [u32; 16];
pub(crate) type HashManyContiguousFn =
unsafe fn(input: *const u8, num_chunks: usize, key: &[u32; 8], counter: u64, flags: u32, out: *mut u8);
pub(crate) type ChunkCompressBlocksFn = fn(&mut [u32; 8], u64, u32, &mut u8, &[u8]);
#[cfg(target_arch = "x86_64")]
pub(crate) type X86CompressCvBytesFn = unsafe fn(&[u32; 8], *const u8, u64, u32, u32) -> [u32; 8];
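/// One kernel's dispatch table. `Copy` so callers can cache it by value; the
/// `name` field is carried only for tests and the `diag` feature.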
#[derive(Clone, Copy)]
pub(crate) struct Kernel {
pub(crate) id: Blake3KernelId,
pub(crate) compress: CompressFn,
pub(crate) chunk_compress_blocks: ChunkCompressBlocksFn,
pub(crate) hash_many_contiguous: HashManyContiguousFn,
#[cfg(target_arch = "x86_64")]
pub(crate) x86_compress_cv_bytes: X86CompressCvBytesFn,
#[cfg(any(test, feature = "diag"))]
#[cfg_attr(test, allow(dead_code))]
pub(crate) name: &'static str,
}
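/// Identifies a compression kernel. Discriminants are spelled out so each id
/// keeps its value regardless of which cfg'd variants exist on a target, and
/// the enum is `#[non_exhaustive]` to leave room for future kernels.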
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
#[non_exhaustive]
pub enum Blake3KernelId {
Portable = 0,
#[cfg(target_arch = "x86_64")]
X86Sse41 = 2,
#[cfg(target_arch = "x86_64")]
X86Avx2 = 3,
#[cfg(target_arch = "x86_64")]
X86Avx512 = 4,
#[cfg(target_arch = "aarch64")]
Aarch64Neon = 5,
#[cfg(target_arch = "s390x")]
S390xVector = 6,
#[cfg(target_arch = "powerpc64")]
PowerVsx = 7,
#[cfg(target_arch = "riscv64")]
RiscvV = 8,
}
impl Blake3KernelId {
#[cfg(any(test, feature = "diag"))]
#[inline]
#[must_use]
pub const fn as_str(self) -> &'static str {
match self {
Self::Portable => "portable",
#[cfg(target_arch = "x86_64")]
Self::X86Sse41 => "x86_64/sse4.1",
#[cfg(target_arch = "x86_64")]
Self::X86Avx2 => "x86_64/avx2",
#[cfg(target_arch = "x86_64")]
Self::X86Avx512 => "x86_64/avx512",
#[cfg(target_arch = "aarch64")]
Self::Aarch64Neon => "aarch64/neon",
#[cfg(target_arch = "s390x")]
Self::S390xVector => "s390x/vector",
#[cfg(target_arch = "powerpc64")]
Self::PowerVsx => "powerpc64/vsx",
#[cfg(target_arch = "riscv64")]
Self::RiscvV => "riscv64/v",
}
}
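/// Number of inputs the kernel can process per SIMD batch.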
#[inline]
#[must_use]
pub const fn simd_degree(self) -> usize {
match self {
Self::Portable => 1,
#[cfg(target_arch = "x86_64")]
Self::X86Sse41 => 4,
#[cfg(target_arch = "x86_64")]
Self::X86Avx2 => 8,
#[cfg(target_arch = "x86_64")]
Self::X86Avx512 => 16,
#[cfg(target_arch = "aarch64")]
Self::Aarch64Neon => 4,
#[cfg(target_arch = "s390x")]
Self::S390xVector => 4,
#[cfg(target_arch = "powerpc64")]
Self::PowerVsx => 4,
#[cfg(target_arch = "riscv64")]
Self::RiscvV => 4,
}
}
}
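/// Builds the dispatch table for `id`. The non-portable entries point at
/// `target_feature` kernels, so callers are expected to have verified
/// `required_caps(id)` before invoking them.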
#[must_use]
pub(crate) fn kernel(id: Blake3KernelId) -> Kernel {
match id {
Blake3KernelId::Portable => Kernel {
id,
compress: super::compress,
chunk_compress_blocks: chunk_compress_blocks_portable,
hash_many_contiguous: hash_many_contiguous_portable,
#[cfg(target_arch = "x86_64")]
x86_compress_cv_bytes: x86_compress_cv_portable_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Sse41 => Kernel {
id,
compress: compress_sse41_wrapper,
chunk_compress_blocks: chunk_compress_blocks_sse41_wrapper,
hash_many_contiguous: hash_many_contiguous_sse41_wrapper,
x86_compress_cv_bytes: x86_compress_cv_sse41_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx2 => Kernel {
id,
compress: compress_avx2_wrapper,
chunk_compress_blocks: chunk_compress_blocks_avx2_wrapper,
hash_many_contiguous: hash_many_contiguous_avx2_wrapper,
x86_compress_cv_bytes: x86_compress_cv_avx2_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx512 => Kernel {
id,
compress: compress_avx512_wrapper,
chunk_compress_blocks: chunk_compress_blocks_avx512_wrapper,
hash_many_contiguous: hash_many_contiguous_avx512_wrapper,
x86_compress_cv_bytes: x86_compress_cv_avx512_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
#[cfg(target_arch = "aarch64")]
Blake3KernelId::Aarch64Neon => Kernel {
id,
compress: compress_neon_wrapper,
chunk_compress_blocks: chunk_compress_blocks_neon_wrapper,
hash_many_contiguous: hash_many_contiguous_neon_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
#[cfg(target_arch = "s390x")]
Blake3KernelId::S390xVector => Kernel {
id,
compress: compress_s390x_vector_wrapper,
chunk_compress_blocks: chunk_compress_blocks_s390x_vector_wrapper,
hash_many_contiguous: hash_many_contiguous_s390x_vector_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
#[cfg(target_arch = "powerpc64")]
Blake3KernelId::PowerVsx => Kernel {
id,
compress: compress_power_vsx_wrapper,
chunk_compress_blocks: chunk_compress_blocks_power_vsx_wrapper,
hash_many_contiguous: hash_many_contiguous_power_vsx_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
#[cfg(target_arch = "riscv64")]
Blake3KernelId::RiscvV => Kernel {
id,
compress: compress_riscv_v_wrapper,
chunk_compress_blocks: chunk_compress_blocks_riscv_v_wrapper,
hash_many_contiguous: hash_many_contiguous_riscv_v_wrapper,
#[cfg(any(test, feature = "diag"))]
name: id.as_str(),
},
}
}
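// A minimal usage sketch (hypothetical caller; capability detection lives
// outside this function, and `key_words`, `block_words`, and `flags` are
// stand-ins):
//
//     let k = kernel(Blake3KernelId::Portable);
//     let out16 = (k.compress)(&key_words, &block_words, 0, BLOCK_LEN as u32, flags);
//
// The `*_inline` helpers below are the match-based twins of the table; they
// are `#[inline(always)]` so the dispatch can fold away when `id` is known at
// the call site.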
#[inline(always)]
pub(crate) fn chunk_compress_blocks_inline(
id: Blake3KernelId,
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
match id {
Blake3KernelId::Portable => {
chunk_compress_blocks_portable(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Sse41 => {
chunk_compress_blocks_sse41_wrapper(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx2 => {
chunk_compress_blocks_avx2_wrapper(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx512 => {
chunk_compress_blocks_avx512_wrapper(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
#[cfg(target_arch = "aarch64")]
Blake3KernelId::Aarch64Neon => {
chunk_compress_blocks_neon_wrapper(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
#[cfg(target_arch = "s390x")]
Blake3KernelId::S390xVector => {
chunk_compress_blocks_s390x_vector_wrapper(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
#[cfg(target_arch = "powerpc64")]
Blake3KernelId::PowerVsx => {
chunk_compress_blocks_power_vsx_wrapper(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
#[cfg(target_arch = "riscv64")]
Blake3KernelId::RiscvV => {
chunk_compress_blocks_riscv_v_wrapper(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
}
}
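// Compresses a single full block in place. On x86_64 with a supported OS ABI
// this uses the hand-written assembly kernels; everywhere else it falls back
// to the portable `compress`. Safety: the caller must have verified the caps
// for `id`.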
#[inline(always)]
pub(crate) unsafe fn compress_block_asm_inline(
id: Blake3KernelId,
chaining_value: &mut [u32; 8],
block: &[u8; BLOCK_LEN],
chunk_counter: u64,
flags: u32,
) {
#[cfg(all(
target_arch = "x86_64",
any(target_os = "linux", target_os = "macos", target_os = "windows")
))]
match id {
Blake3KernelId::X86Avx512 => {
unsafe {
super::x86_64::asm::compress_in_place_avx512_mut(
chaining_value,
block.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags,
);
}
return;
}
Blake3KernelId::X86Avx2 => {
unsafe {
super::x86_64::asm::compress_in_place_avx2_mut(
chaining_value,
block.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags,
);
}
return;
}
Blake3KernelId::X86Sse41 => {
unsafe {
super::x86_64::asm::compress_in_place_sse41_mut(
chaining_value,
block.as_ptr(),
chunk_counter,
BLOCK_LEN as u32,
flags,
);
}
return;
}
_ => {}
}
let _ = id;
let block_words = words16_from_le_bytes_64(block);
*chaining_value = first_8_words((super::compress)(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags,
));
}
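// Per-kernel dispatch for hashing `num_chunks` contiguous full chunks.
// Safety: `input` must cover `num_chunks * CHUNK_LEN` readable bytes and
// `out` must have room for `num_chunks * OUT_LEN` bytes.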
#[inline(always)]
pub(crate) unsafe fn hash_many_contiguous_inline(
id: Blake3KernelId,
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
match id {
Blake3KernelId::Portable => {
unsafe { hash_many_contiguous_portable(input, num_chunks, key, counter, flags, out) }
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Sse41 => {
unsafe { hash_many_contiguous_sse41_wrapper(input, num_chunks, key, counter, flags, out) }
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx2 => {
unsafe { hash_many_contiguous_avx2_wrapper(input, num_chunks, key, counter, flags, out) }
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx512 => {
unsafe { hash_many_contiguous_avx512_wrapper(input, num_chunks, key, counter, flags, out) }
}
#[cfg(target_arch = "aarch64")]
Blake3KernelId::Aarch64Neon => {
unsafe { hash_many_contiguous_neon_wrapper(input, num_chunks, key, counter, flags, out) }
}
#[cfg(target_arch = "s390x")]
Blake3KernelId::S390xVector => {
unsafe { hash_many_contiguous_s390x_vector_wrapper(input, num_chunks, key, counter, flags, out) }
}
#[cfg(target_arch = "powerpc64")]
Blake3KernelId::PowerVsx => {
unsafe { hash_many_contiguous_power_vsx_wrapper(input, num_chunks, key, counter, flags, out) }
}
#[cfg(target_arch = "riscv64")]
Blake3KernelId::RiscvV => {
unsafe { hash_many_contiguous_riscv_v_wrapper(input, num_chunks, key, counter, flags, out) }
}
}
}
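// Per-kernel dispatch for one parent node: the two child CVs form the
// 64-byte block, the key words seed the state, the counter is zero, and the
// kernel ORs PARENT into `flags`.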
#[inline(always)]
pub(crate) fn parent_cv_inline(
id: Blake3KernelId,
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
match id {
Blake3KernelId::Portable => parent_cv_portable(left_child_cv, right_child_cv, key_words, flags),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Sse41 => parent_cv_sse41_wrapper(left_child_cv, right_child_cv, key_words, flags),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx2 => parent_cv_avx2_wrapper(left_child_cv, right_child_cv, key_words, flags),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx512 => parent_cv_avx512_wrapper(left_child_cv, right_child_cv, key_words, flags),
#[cfg(target_arch = "aarch64")]
Blake3KernelId::Aarch64Neon => parent_cv_neon_wrapper(left_child_cv, right_child_cv, key_words, flags),
#[cfg(target_arch = "s390x")]
Blake3KernelId::S390xVector => parent_cv_s390x_vector_wrapper(left_child_cv, right_child_cv, key_words, flags),
#[cfg(target_arch = "powerpc64")]
Blake3KernelId::PowerVsx => parent_cv_power_vsx_wrapper(left_child_cv, right_child_cv, key_words, flags),
#[cfg(target_arch = "riscv64")]
Blake3KernelId::RiscvV => parent_cv_riscv_v_wrapper(left_child_cv, right_child_cv, key_words, flags),
}
}
#[inline(always)]
pub(crate) fn compress_block_inline(
id: Blake3KernelId,
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
match id {
Blake3KernelId::Portable => super::compress(chaining_value, block_words, counter, block_len, flags),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Sse41 => compress_sse41_wrapper(chaining_value, block_words, counter, block_len, flags),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx2 => compress_avx2_wrapper(chaining_value, block_words, counter, block_len, flags),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx512 => compress_avx512_wrapper(chaining_value, block_words, counter, block_len, flags),
#[cfg(target_arch = "aarch64")]
Blake3KernelId::Aarch64Neon => compress_neon_wrapper(chaining_value, block_words, counter, block_len, flags),
#[cfg(target_arch = "s390x")]
Blake3KernelId::S390xVector => {
compress_s390x_vector_wrapper(chaining_value, block_words, counter, block_len, flags)
}
#[cfg(target_arch = "powerpc64")]
Blake3KernelId::PowerVsx => compress_power_vsx_wrapper(chaining_value, block_words, counter, block_len, flags),
#[cfg(target_arch = "riscv64")]
Blake3KernelId::RiscvV => compress_riscv_v_wrapper(chaining_value, block_words, counter, block_len, flags),
}
}
#[inline]
pub(crate) fn root_output_block_bytes(
id: Blake3KernelId,
chaining_value: &[u32; 8],
block_bytes: &[u8; BLOCK_LEN],
counter: u64,
block_len: u32,
flags: u32,
out: &mut [u8; 2 * OUT_LEN],
) {
root_output_block_bytes_inline(id, chaining_value, block_bytes, counter, block_len, flags, out);
}
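// On little-endian targets the 16 output words already have the serialized
// byte layout, so a single copy suffices; big-endian targets write word by
// word.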
#[inline(always)]
fn write_root_output_words(out: &mut [u8; 2 * OUT_LEN], words: &[u32; 16]) {
if cfg!(target_endian = "little") {
unsafe { core::ptr::copy_nonoverlapping(words.as_ptr().cast::<u8>(), out.as_mut_ptr(), 2 * OUT_LEN) };
return;
}
for (idx, word) in words.iter().copied().enumerate() {
let offset = idx * 4;
out[offset..offset + 4].copy_from_slice(&word.to_le_bytes());
}
}
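// Writes one 64-byte root-output block (all 16 output words), preferring an
// arch-specific single-block kernel where one exists.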
#[inline(always)]
fn root_output_block_words_inline(
id: Blake3KernelId,
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
out: &mut [u8; 2 * OUT_LEN],
) {
#[cfg(target_arch = "x86_64")]
{
match id {
Blake3KernelId::X86Avx512 => {
unsafe {
super::x86_64::avx512::root_output_blocks1(
chaining_value,
block_words,
counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
return;
}
Blake3KernelId::X86Avx2 => {
unsafe {
super::x86_64::avx2::root_output_blocks1(
chaining_value,
block_words,
counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
return;
}
Blake3KernelId::X86Sse41 => {
unsafe {
super::x86_64::sse41::root_output_blocks1(
chaining_value,
block_words,
counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
return;
}
_ => {}
}
}
#[cfg(target_arch = "aarch64")]
if id == Blake3KernelId::Aarch64Neon {
let words = super::compress(chaining_value, block_words, counter, block_len, flags);
write_root_output_words(out, &words);
return;
}
#[cfg(target_arch = "s390x")]
if id == Blake3KernelId::S390xVector {
unsafe {
root_output_blocks1_s390x_vector(chaining_value, block_words, counter, block_len, flags, out.as_mut_ptr())
};
return;
}
#[cfg(target_arch = "powerpc64")]
if id == Blake3KernelId::PowerVsx {
unsafe { root_output_blocks1_power_vsx(chaining_value, block_words, counter, block_len, flags, out.as_mut_ptr()) };
return;
}
#[cfg(target_arch = "riscv64")]
if id == Blake3KernelId::RiscvV {
unsafe { root_output_blocks1_riscv_v(chaining_value, block_words, counter, block_len, flags, out.as_mut_ptr()) };
return;
}
let words = compress_block_inline(id, chaining_value, block_words, counter, block_len, flags);
write_root_output_words(out, &words);
}
#[inline(always)]
fn root_output_block_bytes_inline(
id: Blake3KernelId,
chaining_value: &[u32; 8],
block_bytes: &[u8; BLOCK_LEN],
counter: u64,
block_len: u32,
flags: u32,
out: &mut [u8; 2 * OUT_LEN],
) {
#[cfg(target_arch = "x86_64")]
{
match id {
Blake3KernelId::X86Avx512 | Blake3KernelId::X86Avx2 | Blake3KernelId::X86Sse41 => {
unsafe {
super::x86_64::sse41::root_output_blocks1_bytes(
chaining_value,
block_bytes,
counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
return;
}
_ => {}
}
}
let block_words = super::words16_from_le_bytes_64(block_bytes);
root_output_block_words_inline(id, chaining_value, &block_words, counter, block_len, flags, out);
}
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn root_output_tail_block_mut(out: &mut [u8]) -> &mut [u8; 2 * OUT_LEN] {
debug_assert!(out.len() >= 2 * OUT_LEN);
unsafe { &mut *out.as_mut_ptr().cast::<[u8; 2 * OUT_LEN]>() }
}
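// XOF output loop: the same root block is re-compressed with an incrementing
// output block counter, taking the widest batch kernel that still fits the
// remaining output (16/8/4/2 blocks on x86_64, 4 on NEON) and falling back
// to single blocks otherwise.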
#[inline(always)]
pub(crate) fn root_output_blocks_bytes_into_inline(
id: Blake3KernelId,
chaining_value: &[u32; 8],
block_words: &[u32; 16],
mut output_block_counter: u64,
block_len: u32,
flags: u32,
mut out: &mut [u8],
) {
debug_assert!(out.len().is_multiple_of(2 * OUT_LEN));
while !out.is_empty() {
let blocks_remaining = out.len() / (2 * OUT_LEN);
#[cfg(not(target_arch = "x86_64"))]
let _ = blocks_remaining;
#[cfg(target_arch = "x86_64")]
{
match id {
Blake3KernelId::X86Avx512 if blocks_remaining >= 16 => {
unsafe {
super::x86_64::avx512::root_output_blocks16(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(16);
out = &mut out[16 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx512 if blocks_remaining >= 8 => {
unsafe {
super::x86_64::avx2::root_output_blocks8(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(8);
out = &mut out[8 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx512 if blocks_remaining >= 4 => {
unsafe {
super::x86_64::sse41::root_output_blocks4(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(4);
out = &mut out[4 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx512 if blocks_remaining >= 2 => {
unsafe {
super::x86_64::avx512::root_output_blocks2(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(2);
out = &mut out[2 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx512 => {
let block_out = root_output_tail_block_mut(out);
root_output_block_words_inline(
id,
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
block_out,
);
output_block_counter = output_block_counter.wrapping_add(1);
out = &mut out[2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx2 if blocks_remaining >= 8 => {
unsafe {
super::x86_64::avx2::root_output_blocks8(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(8);
out = &mut out[8 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx2 if blocks_remaining >= 4 => {
unsafe {
super::x86_64::sse41::root_output_blocks4(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(4);
out = &mut out[4 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx2 if blocks_remaining >= 2 => {
unsafe {
super::x86_64::avx2::root_output_blocks2(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(2);
out = &mut out[2 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Avx2 => {
let block_out = root_output_tail_block_mut(out);
root_output_block_words_inline(
id,
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
block_out,
);
output_block_counter = output_block_counter.wrapping_add(1);
out = &mut out[2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Sse41 if blocks_remaining >= 4 => {
unsafe {
super::x86_64::sse41::root_output_blocks4(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(4);
out = &mut out[4 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Sse41 if blocks_remaining >= 2 => {
unsafe {
super::x86_64::sse41::root_output_blocks2(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(2);
out = &mut out[2 * 2 * OUT_LEN..];
continue;
}
Blake3KernelId::X86Sse41 => {
let block_out = root_output_tail_block_mut(out);
root_output_block_words_inline(
id,
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
block_out,
);
output_block_counter = output_block_counter.wrapping_add(1);
out = &mut out[2 * OUT_LEN..];
continue;
}
_ => {}
}
}
#[cfg(target_arch = "aarch64")]
{
if id == Blake3KernelId::Aarch64Neon && blocks_remaining >= 4 {
unsafe {
super::aarch64::root_output_blocks4_neon(
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
out.as_mut_ptr(),
);
}
output_block_counter = output_block_counter.wrapping_add(4);
out = &mut out[4 * 2 * OUT_LEN..];
continue;
}
}
let block_out = unsafe { &mut *out.as_mut_ptr().cast::<[u8; 2 * OUT_LEN]>() };
root_output_block_words_inline(
id,
chaining_value,
block_words,
output_block_counter,
block_len,
flags,
block_out,
);
output_block_counter = output_block_counter.wrapping_add(1);
out = &mut out[2 * OUT_LEN..];
}
}
#[inline(always)]
pub(crate) fn root_output_blocks_from_block_bytes_into_inline(
id: Blake3KernelId,
chaining_value: &[u32; 8],
block_bytes: &[u8; BLOCK_LEN],
output_block_counter: u64,
block_len: u32,
flags: u32,
out: &mut [u8],
) {
debug_assert!(out.len().is_multiple_of(2 * OUT_LEN));
if out.len() == 2 * OUT_LEN {
let block_out: &mut [u8; 2 * OUT_LEN] = unsafe { &mut *out.as_mut_ptr().cast::<[u8; 2 * OUT_LEN]>() };
root_output_block_bytes_inline(
id,
chaining_value,
block_bytes,
output_block_counter,
block_len,
flags,
block_out,
);
return;
}
let block_words = super::words16_from_le_bytes_64(block_bytes);
root_output_blocks_bytes_into_inline(
id,
chaining_value,
&block_words,
output_block_counter,
block_len,
flags,
out,
);
}
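// Builds a full DEGREE-wide pointer array for a possibly partial batch,
// padding unused lanes with a pointer to the last valid block so the SIMD
// kernel never reads out of bounds.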
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn parent_block_ptrs<const DEGREE: usize, PtrAt>(
start: usize,
rem: usize,
total: usize,
ptr_at: &mut PtrAt,
) -> [*const u8; DEGREE]
where
PtrAt: FnMut(usize) -> *const u8,
{
debug_assert!(rem != 0);
debug_assert!(rem <= DEGREE);
let last_ptr = ptr_at(total - 1);
let mut ptrs = [last_ptr; DEGREE];
let mut lane = 0usize;
while lane < rem {
ptrs[lane] = ptr_at(start + lane);
lane += 1;
}
ptrs
}
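// Drives a hash-many kernel over parent blocks in DEGREE-sized batches,
// handing each produced CV to `sink` along with its logical index.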
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn reduce_parent_blocks_lanes<const DEGREE: usize, PtrAt, HashMany, Sink>(
count: usize,
mut ptr_at: PtrAt,
mut hash_many: HashMany,
mut sink: Sink,
) where
PtrAt: FnMut(usize) -> *const u8,
HashMany: FnMut(&[*const u8; DEGREE], usize, &mut [[u8; OUT_LEN]; DEGREE]),
Sink: FnMut(usize, &[u8; OUT_LEN]),
{
let mut tmp = [[0u8; OUT_LEN]; DEGREE];
let mut i = 0usize;
while i < count {
let rem = core::cmp::min(DEGREE, count - i);
let ptrs = parent_block_ptrs::<DEGREE, _>(i, rem, count, &mut ptr_at);
hash_many(&ptrs, rem, &mut tmp);
let mut lane = 0usize;
while lane < rem {
sink(i + lane, &tmp[lane]);
lane += 1;
}
i += rem;
}
}
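// On little-endian targets a `[u32; 8]` CV has the same layout as its
// 32-byte serialization, so the word-based inputs can be reinterpreted as
// bytes and forwarded to the byte-based batch routine below.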
#[cfg(any(target_endian = "little", feature = "parallel"))]
#[inline]
pub(crate) fn parent_cvs_many_from_cvs_inline(
id: Blake3KernelId,
children: &[[u32; 8]],
key_words: [u32; 8],
flags: u32,
out: &mut [[u32; 8]],
) {
debug_assert_eq!(children.len(), out.len() * 2);
if out.is_empty() {
return;
}
if id == Blake3KernelId::Portable {
for (pair, out_cv) in children.chunks_exact(2).zip(out.iter_mut()) {
*out_cv = parent_cv_inline(id, pair[0], pair[1], key_words, flags);
}
return;
}
#[cfg(target_endian = "little")]
{
let children_bytes: &[[u8; OUT_LEN]] =
unsafe { core::slice::from_raw_parts(children.as_ptr().cast(), children.len()) };
let out_bytes: &mut [[u8; OUT_LEN]] =
unsafe { core::slice::from_raw_parts_mut(out.as_mut_ptr().cast(), out.len()) };
parent_cvs_many_from_bytes_inline(id, children_bytes, key_words, flags, out_bytes);
}
#[cfg(not(target_endian = "little"))]
{
for (pair, out_cv) in children.chunks_exact(2).zip(out.iter_mut()) {
*out_cv = parent_cv_inline(id, pair[0], pair[1], key_words, flags);
}
}
}
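// Computes `out.len()` parent CVs from `2 * out.len()` serialized children,
// batching through the widest parent kernel available for `id`.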
#[inline]
pub(crate) fn parent_cvs_many_from_bytes_inline(
id: Blake3KernelId,
children: &[[u8; OUT_LEN]],
key_words: [u32; 8],
flags: u32,
out: &mut [[u8; OUT_LEN]],
) {
debug_assert_eq!(children.len(), out.len() * 2);
if out.is_empty() {
return;
}
match id {
Blake3KernelId::Portable => {}
#[cfg(target_arch = "aarch64")]
Blake3KernelId::Aarch64Neon => {
unsafe { super::aarch64::parent_cvs_many_neon(children, key_words, flags, out) };
return;
}
#[cfg(target_arch = "s390x")]
Blake3KernelId::S390xVector => {
parent_cvs_many4_simd(children, key_words, flags, out);
return;
}
#[cfg(target_arch = "powerpc64")]
Blake3KernelId::PowerVsx => {
parent_cvs_many4_simd(children, key_words, flags, out);
return;
}
#[cfg(target_arch = "riscv64")]
Blake3KernelId::RiscvV => {
parent_cvs_many4_simd(children, key_words, flags, out);
return;
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Sse41 => {
let parent_flags = PARENT | flags;
reduce_parent_blocks_lanes::<{ super::x86_64::sse41::DEGREE }, _, _, _>(
out.len(),
|idx| children[2 * idx].as_ptr(),
|ptrs, _rem, tmp| {
unsafe {
super::x86_64::sse41::hash4(
ptrs,
1,
&key_words,
0,
false,
parent_flags,
0,
0,
tmp.as_mut_ptr().cast::<u8>(),
);
}
},
|idx, bytes| out[idx] = *bytes,
);
return;
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx512 => {
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
let parent_flags = PARENT | flags;
const DEGREE: usize = 16;
debug_assert!(parent_flags <= u8::MAX as u32);
reduce_parent_blocks_lanes::<DEGREE, _, _, _>(
out.len(),
|idx| children[2 * idx].as_ptr(),
|ptrs, rem, tmp| {
unsafe {
super::x86_64::asm::hash_many_avx512(
ptrs.as_ptr(),
rem,
1,
key_words.as_ptr(),
0,
false,
parent_flags as u8,
0,
0,
tmp.as_mut_ptr().cast::<u8>(),
);
}
},
|idx, bytes| out[idx] = *bytes,
);
return;
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
for (pair, out_cv) in children.chunks_exact(2).zip(out.iter_mut()) {
let left = words8_from_le_bytes_32(&pair[0]);
let right = words8_from_le_bytes_32(&pair[1]);
*out_cv = super::words8_to_le_bytes(&parent_cv_inline(
Blake3KernelId::X86Avx512,
left,
right,
key_words,
flags,
));
}
return;
}
}
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx2 => {
let parent_flags = PARENT | flags;
reduce_parent_blocks_lanes::<{ super::x86_64::avx2::DEGREE }, _, _, _>(
out.len(),
|idx| children[2 * idx].as_ptr(),
|ptrs, _rem, tmp| {
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
let rem = _rem;
debug_assert!(parent_flags <= u8::MAX as u32);
unsafe {
super::x86_64::asm::hash_many_avx2(
ptrs.as_ptr(),
rem,
1,
key_words.as_ptr(),
0,
false,
parent_flags as u8,
0,
0,
tmp.as_mut_ptr().cast::<u8>(),
);
}
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
unsafe {
super::x86_64::avx2::hash8(
ptrs,
1,
&key_words,
0,
false,
parent_flags,
0,
0,
tmp.as_mut_ptr().cast::<u8>(),
)
};
}
},
|idx, bytes| out[idx] = *bytes,
);
return;
}
}
for (pair, out_cv) in children.chunks_exact(2).zip(out.iter_mut()) {
let left = words8_from_le_bytes_32(&pair[0]);
let right = words8_from_le_bytes_32(&pair[1]);
*out_cv = super::words8_to_le_bytes(&parent_cv_inline(id, left, right, key_words, flags));
}
}
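/// CPU capabilities that must be present before `id` may be selected. The
/// AVX-512 split mirrors `x86_compress_cv_avx512_wrapper`: the assembly path
/// (OSes with a supported ABI) needs F and VL, while the intrinsics path
/// also uses DQ.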
#[inline]
#[must_use]
pub const fn required_caps(id: Blake3KernelId) -> Caps {
match id {
Blake3KernelId::Portable => Caps::NONE,
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Sse41 => x86::SSE41.union(x86::SSSE3),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx2 => x86::AVX2.union(x86::SSE41).union(x86::SSSE3),
#[cfg(target_arch = "x86_64")]
Blake3KernelId::X86Avx512 => {
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
x86::AVX512F
.union(x86::AVX512VL)
.union(x86::AVX2)
.union(x86::SSE41)
.union(x86::SSSE3)
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
x86::AVX512F
.union(x86::AVX512VL)
.union(x86::AVX512DQ)
.union(x86::AVX2)
.union(x86::SSE41)
.union(x86::SSSE3)
}
}
#[cfg(target_arch = "aarch64")]
Blake3KernelId::Aarch64Neon => aarch64::NEON,
#[cfg(target_arch = "s390x")]
Blake3KernelId::S390xVector => s390x::VECTOR,
#[cfg(target_arch = "powerpc64")]
Blake3KernelId::PowerVsx => power::VSX,
#[cfg(target_arch = "riscv64")]
Blake3KernelId::RiscvV => riscv::V,
}
}
#[inline]
fn chunk_compress_blocks_portable(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);
if blocks.len() == BLOCK_LEN {
let block_bytes: &[u8; BLOCK_LEN] = unsafe { &*(blocks.as_ptr().cast()) };
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words((super::compress)(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
));
*blocks_compressed = blocks_compressed.wrapping_add(1);
return;
}
let (block_slices, remainder) = blocks.as_chunks::<BLOCK_LEN>();
debug_assert!(remainder.is_empty());
for block_bytes in block_slices {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words((super::compress)(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
));
*blocks_compressed = blocks_compressed.wrapping_add(1);
}
}
#[inline]
fn parent_cv_portable(left_child_cv: [u32; 8], right_child_cv: [u32; 8], key_words: [u32; 8], flags: u32) -> [u32; 8] {
let mut block_words = [0u32; 16];
block_words[..8].copy_from_slice(&left_child_cv);
block_words[8..].copy_from_slice(&right_child_cv);
first_8_words((super::compress)(
&key_words,
&block_words,
0,
BLOCK_LEN as u32,
PARENT | flags,
))
}
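// Reference implementation: walks each chunk's 16 blocks, applying
// CHUNK_START to the first and CHUNK_END to the last, then serializes the
// resulting CV little-endian.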
unsafe fn hash_many_contiguous_portable(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
debug_assert!(num_chunks != 0);
for chunk_idx in 0..num_chunks {
let chunk_counter = counter.wrapping_add(chunk_idx as u64);
let mut cv = *key;
for block_idx in 0..(CHUNK_LEN / BLOCK_LEN) {
let block_words = unsafe {
let src = input.add(chunk_idx * CHUNK_LEN + block_idx * BLOCK_LEN);
words16_from_le_bytes_64(&*src.cast::<[u8; BLOCK_LEN]>())
};
let start = if block_idx == 0 { CHUNK_START } else { 0 };
let end = if block_idx + 1 == (CHUNK_LEN / BLOCK_LEN) {
super::CHUNK_END
} else {
0
};
let block_flags = flags | start | end;
cv = first_8_words(super::compress(
&cv,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
block_flags,
));
}
for (j, &word) in cv.iter().enumerate() {
let bytes = word.to_le_bytes();
unsafe { core::ptr::copy_nonoverlapping(bytes.as_ptr(), out.add(chunk_idx * OUT_LEN + j * 4), 4) };
}
}
}
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
unsafe fn write_cv_words(out: *mut u8, cv: &[u32; 8]) {
for (j, &word) in cv.iter().enumerate() {
let bytes = word.to_le_bytes();
unsafe { core::ptr::copy_nonoverlapping(bytes.as_ptr(), out.add(j * 4), 4) };
}
}
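// Lane rotations used to diagonalize and undiagonalize the state rows in
// `compress_simd_leaf`.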
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn rot_lanes_left_1(v: core::simd::u32x4) -> core::simd::u32x4 {
core::simd::simd_swizzle!(v, [1, 2, 3, 0])
}
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn rot_lanes_left_2(v: core::simd::u32x4) -> core::simd::u32x4 {
core::simd::simd_swizzle!(v, [2, 3, 0, 1])
}
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn rot_lanes_left_3(v: core::simd::u32x4) -> core::simd::u32x4 {
core::simd::simd_swizzle!(v, [3, 0, 1, 2])
}
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn rotr32<const N: u32>(v: core::simd::u32x4) -> core::simd::u32x4 {
debug_assert!(N > 0 && N < 32);
let s0 = core::simd::u32x4::splat(N);
let s1 = core::simd::u32x4::splat(32 - N);
(v >> s0) | (v << s1)
}
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn load_msg_vec_const<const I0: usize, const I1: usize, const I2: usize, const I3: usize>(
block_words: &[u32; 16],
) -> core::simd::u32x4 {
const {
assert!(I0 < 16);
assert!(I1 < 16);
assert!(I2 < 16);
assert!(I3 < 16);
}
core::simd::u32x4::from_array([block_words[I0], block_words[I1], block_words[I2], block_words[I3]])
}
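// The seven BLAKE3 message schedules: round 1 is the identity, and each
// later round applies the fixed BLAKE3 permutation to the previous row.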
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
const MSG_SCHEDULE_SIMD: [[usize; 16]; 7] = [
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
[2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8],
[3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1],
[10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6],
[12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4],
[9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7],
[11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13],
];
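// Single-block compression with the state held as four row vectors. Each
// round runs the column step, rotates rows 1-3 so the diagonals line up as
// columns, runs the diagonal step, and rotates back; the message vectors are
// gathered with the schedule baked in as const generic indices.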
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn compress_simd_leaf(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
type Vec4 = core::simd::u32x4;
let mut row0 = Vec4::from_array([
chaining_value[0],
chaining_value[1],
chaining_value[2],
chaining_value[3],
]);
let mut row1 = Vec4::from_array([
chaining_value[4],
chaining_value[5],
chaining_value[6],
chaining_value[7],
]);
let mut row2 = Vec4::from_array([super::IV[0], super::IV[1], super::IV[2], super::IV[3]]);
let mut row3 = Vec4::from_array([counter as u32, (counter >> 32) as u32, block_len, flags]);
macro_rules! g {
($mx:expr, $my:expr) => {{
row0 += row1;
row0 += $mx;
row3 ^= row0;
row3 = rotr32::<16>(row3);
row2 += row3;
row1 ^= row2;
row1 = rotr32::<12>(row1);
row0 += row1;
row0 += $my;
row3 ^= row0;
row3 = rotr32::<8>(row3);
row2 += row3;
row1 ^= row2;
row1 = rotr32::<7>(row1);
}};
}
macro_rules! round {
(
$c0:literal, $c1:literal, $c2:literal, $c3:literal, $c4:literal, $c5:literal, $c6:literal, $c7:literal,
$d0:literal, $d1:literal, $d2:literal, $d3:literal, $d4:literal, $d5:literal, $d6:literal, $d7:literal
) => {{
let mx0 = load_msg_vec_const::<$c0, $c2, $c4, $c6>(block_words);
let my0 = load_msg_vec_const::<$c1, $c3, $c5, $c7>(block_words);
let mx1 = load_msg_vec_const::<$d0, $d2, $d4, $d6>(block_words);
let my1 = load_msg_vec_const::<$d1, $d3, $d5, $d7>(block_words);
g!(mx0, my0);
row1 = rot_lanes_left_1(row1);
row2 = rot_lanes_left_2(row2);
row3 = rot_lanes_left_3(row3);
g!(mx1, my1);
row1 = rot_lanes_left_3(row1);
row2 = rot_lanes_left_2(row2);
row3 = rot_lanes_left_1(row3);
}};
}
round!(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
round!(2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8);
round!(3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1);
round!(10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6);
round!(12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4);
round!(9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7);
round!(11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13);
let cv_lo = Vec4::from_array([
chaining_value[0],
chaining_value[1],
chaining_value[2],
chaining_value[3],
]);
let cv_hi = Vec4::from_array([
chaining_value[4],
chaining_value[5],
chaining_value[6],
chaining_value[7],
]);
row0 ^= row2;
row1 ^= row3;
row2 ^= cv_lo;
row3 ^= cv_hi;
let mut out = [0u32; 16];
out[..4].copy_from_slice(&row0.to_array());
out[4..8].copy_from_slice(&row1.to_array());
out[8..12].copy_from_slice(&row2.to_array());
out[12..16].copy_from_slice(&row3.to_array());
out
}
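// Loads the block at `block_offset` from four consecutive chunks and
// transposes it into word-sliced vectors (lane i = chunk i), the layout
// `round4_simd` expects.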
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn load_msg_lanes4_contiguous(base: *const u8, block_offset: usize) -> [core::simd::u32x4; 16] {
let b0 = unsafe { words16_from_le_bytes_64(&*base.add(block_offset).cast::<[u8; BLOCK_LEN]>()) };
let b1 = unsafe { words16_from_le_bytes_64(&*base.add(CHUNK_LEN + block_offset).cast::<[u8; BLOCK_LEN]>()) };
let b2 = unsafe { words16_from_le_bytes_64(&*base.add(2 * CHUNK_LEN + block_offset).cast::<[u8; BLOCK_LEN]>()) };
let b3 = unsafe { words16_from_le_bytes_64(&*base.add(3 * CHUNK_LEN + block_offset).cast::<[u8; BLOCK_LEN]>()) };
let mut out = [core::simd::u32x4::splat(0); 16];
let mut i = 0usize;
while i < 16 {
out[i] = core::simd::u32x4::from_array([b0[i], b1[i], b2[i], b3[i]]);
i += 1;
}
out
}
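// The BLAKE3 G function applied to four independent states at once, one
// state per lane.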
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn g4_simd(
v: &mut [core::simd::u32x4; 16],
a: usize,
b: usize,
c: usize,
d: usize,
mx: core::simd::u32x4,
my: core::simd::u32x4,
) {
v[a] += v[b];
v[a] += mx;
v[d] ^= v[a];
v[d] = rotr32::<16>(v[d]);
v[c] += v[d];
v[b] ^= v[c];
v[b] = rotr32::<12>(v[b]);
v[a] += v[b];
v[a] += my;
v[d] ^= v[a];
v[d] = rotr32::<8>(v[d]);
v[c] += v[d];
v[b] ^= v[c];
v[b] = rotr32::<7>(v[b]);
}
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn round4_simd(v: &mut [core::simd::u32x4; 16], m: &[core::simd::u32x4; 16], r: usize) {
let s = &MSG_SCHEDULE_SIMD[r];
g4_simd(v, 0, 4, 8, 12, m[s[0]], m[s[1]]);
g4_simd(v, 1, 5, 9, 13, m[s[2]], m[s[3]]);
g4_simd(v, 2, 6, 10, 14, m[s[4]], m[s[5]]);
g4_simd(v, 3, 7, 11, 15, m[s[6]], m[s[7]]);
g4_simd(v, 0, 5, 10, 15, m[s[8]], m[s[9]]);
g4_simd(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
g4_simd(v, 2, 7, 8, 13, m[s[12]], m[s[13]]);
g4_simd(v, 3, 4, 9, 14, m[s[14]], m[s[15]]);
}
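// Gathers up to four parent blocks: left-child words land in msg[0..8],
// right-child words in msg[8..16]. Lanes past `rem` repeat the last pair, so
// their results are computed but discarded.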
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn load_parent_msg_lanes4(children: &[[u8; OUT_LEN]], base: usize, rem: usize) -> [core::simd::u32x4; 16] {
type Vec4 = core::simd::u32x4;
debug_assert!(rem > 0 && rem <= 4);
let last = base + rem - 1;
let idx0 = base;
let idx1 = if rem > 1 { base + 1 } else { last };
let idx2 = if rem > 2 { base + 2 } else { last };
let idx3 = if rem > 3 { base + 3 } else { last };
let l0 = words8_from_le_bytes_32(&children[2 * idx0]);
let r0 = words8_from_le_bytes_32(&children[2 * idx0 + 1]);
let l1 = words8_from_le_bytes_32(&children[2 * idx1]);
let r1 = words8_from_le_bytes_32(&children[2 * idx1 + 1]);
let l2 = words8_from_le_bytes_32(&children[2 * idx2]);
let r2 = words8_from_le_bytes_32(&children[2 * idx2 + 1]);
let l3 = words8_from_le_bytes_32(&children[2 * idx3]);
let r3 = words8_from_le_bytes_32(&children[2 * idx3 + 1]);
let mut msg = [Vec4::splat(0); 16];
let mut w = 0usize;
while w < 8 {
msg[w] = Vec4::from_array([l0[w], l1[w], l2[w], l3[w]]);
msg[w + 8] = Vec4::from_array([r0[w], r1[w], r2[w], r3[w]]);
w += 1;
}
msg
}
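// Four parents per iteration: the key words seed the state, the counter is
// zero, and only the first eight output words (the parent CVs) are kept.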
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
fn parent_cvs_many4_simd(children: &[[u8; OUT_LEN]], key_words: [u32; 8], flags: u32, out: &mut [[u8; OUT_LEN]]) {
type Vec4 = core::simd::u32x4;
debug_assert_eq!(children.len(), out.len() * 2);
if out.is_empty() {
return;
}
let parent_flags = PARENT | flags;
let mut i = 0usize;
while i < out.len() {
let rem = core::cmp::min(4, out.len() - i);
let msg = load_parent_msg_lanes4(children, i, rem);
let mut v = [
Vec4::splat(key_words[0]),
Vec4::splat(key_words[1]),
Vec4::splat(key_words[2]),
Vec4::splat(key_words[3]),
Vec4::splat(key_words[4]),
Vec4::splat(key_words[5]),
Vec4::splat(key_words[6]),
Vec4::splat(key_words[7]),
Vec4::splat(super::IV[0]),
Vec4::splat(super::IV[1]),
Vec4::splat(super::IV[2]),
Vec4::splat(super::IV[3]),
Vec4::splat(0),
Vec4::splat(0),
Vec4::splat(BLOCK_LEN as u32),
Vec4::splat(parent_flags),
];
round4_simd(&mut v, &msg, 0);
round4_simd(&mut v, &msg, 1);
round4_simd(&mut v, &msg, 2);
round4_simd(&mut v, &msg, 3);
round4_simd(&mut v, &msg, 4);
round4_simd(&mut v, &msg, 5);
round4_simd(&mut v, &msg, 6);
let mut x = [Vec4::splat(0); 8];
let mut j = 0usize;
while j < 8 {
x[j] = v[j] ^ v[j + 8];
j += 1;
}
let xa0 = x[0].to_array();
let xa1 = x[1].to_array();
let xa2 = x[2].to_array();
let xa3 = x[3].to_array();
let xa4 = x[4].to_array();
let xa5 = x[5].to_array();
let xa6 = x[6].to_array();
let xa7 = x[7].to_array();
let mut lane = 0usize;
while lane < rem {
out[i + lane] = super::words8_to_le_bytes(&[
xa0[lane], xa1[lane], xa2[lane], xa3[lane], xa4[lane], xa5[lane], xa6[lane], xa7[lane],
]);
lane += 1;
}
i += rem;
}
}
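// Hashes four full chunks in lockstep. All lanes share the block flags
// (every lane is at the same block index), while the counters are per lane.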
#[cfg(any(target_arch = "s390x", target_arch = "powerpc64", target_arch = "riscv64"))]
#[inline(always)]
unsafe fn hash4_contiguous_full_chunks_simd(input: *const u8, key: &[u32; 8], counter: u64, flags: u32, out: *mut u8) {
type Vec4 = core::simd::u32x4;
let mut h = [
Vec4::splat(key[0]),
Vec4::splat(key[1]),
Vec4::splat(key[2]),
Vec4::splat(key[3]),
Vec4::splat(key[4]),
Vec4::splat(key[5]),
Vec4::splat(key[6]),
Vec4::splat(key[7]),
];
let counter_low = Vec4::from_array([
counter as u32,
counter.wrapping_add(1) as u32,
counter.wrapping_add(2) as u32,
counter.wrapping_add(3) as u32,
]);
let counter_high = Vec4::from_array([
(counter >> 32) as u32,
(counter.wrapping_add(1) >> 32) as u32,
(counter.wrapping_add(2) >> 32) as u32,
(counter.wrapping_add(3) >> 32) as u32,
]);
let mut block_idx = 0usize;
while block_idx < (CHUNK_LEN / BLOCK_LEN) {
let msg = load_msg_lanes4_contiguous(input, block_idx * BLOCK_LEN);
let block_flags = flags
| if block_idx == 0 { CHUNK_START } else { 0 }
| if block_idx + 1 == (CHUNK_LEN / BLOCK_LEN) {
super::CHUNK_END
} else {
0
};
let mut v = [
h[0],
h[1],
h[2],
h[3],
h[4],
h[5],
h[6],
h[7],
Vec4::splat(super::IV[0]),
Vec4::splat(super::IV[1]),
Vec4::splat(super::IV[2]),
Vec4::splat(super::IV[3]),
counter_low,
counter_high,
Vec4::splat(BLOCK_LEN as u32),
Vec4::splat(block_flags),
];
round4_simd(&mut v, &msg, 0);
round4_simd(&mut v, &msg, 1);
round4_simd(&mut v, &msg, 2);
round4_simd(&mut v, &msg, 3);
round4_simd(&mut v, &msg, 4);
round4_simd(&mut v, &msg, 5);
round4_simd(&mut v, &msg, 6);
let mut i = 0usize;
while i < 8 {
h[i] = v[i] ^ v[i + 8];
i += 1;
}
block_idx += 1;
}
let h0 = h[0].to_array();
let h1 = h[1].to_array();
let h2 = h[2].to_array();
let h3 = h[3].to_array();
let h4 = h[4].to_array();
let h5 = h[5].to_array();
let h6 = h[6].to_array();
let h7 = h[7].to_array();
let mut lane = 0usize;
while lane < 4 {
let dst = unsafe { out.add(lane * OUT_LEN) };
let words = [
h0[lane], h1[lane], h2[lane], h3[lane], h4[lane], h5[lane], h6[lane], h7[lane],
];
let mut word = 0usize;
while word < 8 {
let bytes = words[word].to_le_bytes();
unsafe { core::ptr::copy_nonoverlapping(bytes.as_ptr(), dst.add(word * 4), 4) };
word += 1;
}
lane += 1;
}
}
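// s390x: the generic `core::simd` kernels above, compiled with the `vector`
// target feature. Each `#[target_feature]` function gets a plain wrapper for
// storage behind the function-pointer types.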
#[cfg(target_arch = "s390x")]
#[target_feature(enable = "vector")]
unsafe fn compress_s390x_vector(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
compress_simd_leaf(chaining_value, block_words, counter, block_len, flags)
}
#[cfg(target_arch = "s390x")]
fn compress_s390x_vector_wrapper(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { compress_s390x_vector(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "s390x")]
#[target_feature(enable = "vector")]
unsafe fn root_output_blocks1_s390x_vector(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
out: *mut u8,
) {
let words = unsafe { compress_s390x_vector(chaining_value, block_words, counter, block_len, flags) };
let out = unsafe { &mut *out.cast::<[u8; 2 * OUT_LEN]>() };
write_root_output_words(out, &words);
}
#[cfg(target_arch = "s390x")]
#[target_feature(enable = "vector")]
unsafe fn chunk_compress_blocks_s390x_vector(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);
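// Delegate short batches (fewer than a full chunk's 16 blocks) to the
// portable implementation.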
const SHORT_BATCH_PORTABLE_MAX_BLOCKS: usize = (CHUNK_LEN / BLOCK_LEN) - 1;
let num_blocks = blocks.len() / BLOCK_LEN;
if num_blocks <= SHORT_BATCH_PORTABLE_MAX_BLOCKS {
chunk_compress_blocks_portable(chaining_value, chunk_counter, flags, blocks_compressed, blocks);
return;
}
if blocks.len() == BLOCK_LEN {
let block_bytes: &[u8; BLOCK_LEN] = unsafe { &*(blocks.as_ptr().cast()) };
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words(unsafe {
compress_s390x_vector(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
)
});
*blocks_compressed = blocks_compressed.wrapping_add(1);
return;
}
let (block_slices, remainder) = blocks.as_chunks::<BLOCK_LEN>();
debug_assert!(remainder.is_empty());
for block_bytes in block_slices {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words(unsafe {
compress_s390x_vector(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
)
});
*blocks_compressed = blocks_compressed.wrapping_add(1);
}
}
#[cfg(target_arch = "s390x")]
fn chunk_compress_blocks_s390x_vector_wrapper(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe { chunk_compress_blocks_s390x_vector(chaining_value, chunk_counter, flags, blocks_compressed, blocks) }
}
#[cfg(target_arch = "s390x")]
#[target_feature(enable = "vector")]
unsafe fn parent_cv_s390x_vector(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
let mut block_words = [0u32; 16];
block_words[..8].copy_from_slice(&left_child_cv);
block_words[8..].copy_from_slice(&right_child_cv);
first_8_words(compress_simd_leaf(
&key_words,
&block_words,
0,
BLOCK_LEN as u32,
PARENT | flags,
))
}
#[cfg(target_arch = "s390x")]
fn parent_cv_s390x_vector_wrapper(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe { parent_cv_s390x_vector(left_child_cv, right_child_cv, key_words, flags) }
}
#[cfg(target_arch = "s390x")]
#[target_feature(enable = "vector")]
unsafe fn hash_one_chunk_s390x_vector(input: *const u8, key: &[u32; 8], counter: u64, flags: u32, out: *mut u8) {
let mut cv = *key;
let mut blocks_compressed = 0u8;
let body_len = CHUNK_LEN - BLOCK_LEN;
let body = unsafe { core::slice::from_raw_parts(input, body_len) };
unsafe { chunk_compress_blocks_s390x_vector(&mut cv, counter, flags, &mut blocks_compressed, body) };
let tail_words = unsafe {
let tail_ptr = input.add(body_len);
words16_from_le_bytes_64(&*tail_ptr.cast::<[u8; BLOCK_LEN]>())
};
let start = if blocks_compressed == 0 { CHUNK_START } else { 0 };
let tail_flags = flags | start | super::CHUNK_END;
cv = first_8_words(unsafe { compress_s390x_vector(&cv, &tail_words, counter, BLOCK_LEN as u32, tail_flags) });
unsafe { write_cv_words(out, &cv) };
}
#[cfg(target_arch = "s390x")]
#[target_feature(enable = "vector")]
unsafe fn hash_many_contiguous_s390x_vector(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
debug_assert!(num_chunks != 0);
let mut idx = 0usize;
while idx + 4 <= num_chunks {
unsafe {
hash4_contiguous_full_chunks_simd(
input.add(idx * CHUNK_LEN),
key,
counter.wrapping_add(idx as u64),
flags,
out.add(idx * OUT_LEN),
);
}
idx += 4;
}
while idx < num_chunks {
unsafe {
hash_one_chunk_s390x_vector(
input.add(idx * CHUNK_LEN),
key,
counter.wrapping_add(idx as u64),
flags,
out.add(idx * OUT_LEN),
);
}
idx += 1;
}
}
#[cfg(target_arch = "s390x")]
unsafe fn hash_many_contiguous_s390x_vector_wrapper(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
unsafe { hash_many_contiguous_s390x_vector(input, num_chunks, key, counter, flags, out) }
}
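// powerpc64: the same kernels as s390x, compiled with the `vsx` target
// feature.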
#[cfg(target_arch = "powerpc64")]
#[target_feature(enable = "vsx")]
unsafe fn compress_power_vsx(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
compress_simd_leaf(chaining_value, block_words, counter, block_len, flags)
}
#[cfg(target_arch = "powerpc64")]
fn compress_power_vsx_wrapper(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { compress_power_vsx(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "powerpc64")]
#[target_feature(enable = "vsx")]
unsafe fn root_output_blocks1_power_vsx(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
out: *mut u8,
) {
let words = unsafe { compress_power_vsx(chaining_value, block_words, counter, block_len, flags) };
let out = unsafe { &mut *out.cast::<[u8; 2 * OUT_LEN]>() };
write_root_output_words(out, &words);
}
#[cfg(target_arch = "powerpc64")]
#[target_feature(enable = "vsx")]
unsafe fn chunk_compress_blocks_power_vsx(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);
if blocks.len() == BLOCK_LEN {
let block_bytes: &[u8; BLOCK_LEN] = unsafe { &*(blocks.as_ptr().cast()) };
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words(unsafe {
compress_power_vsx(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
)
});
*blocks_compressed = blocks_compressed.wrapping_add(1);
return;
}
let (block_slices, remainder) = blocks.as_chunks::<BLOCK_LEN>();
debug_assert!(remainder.is_empty());
for block_bytes in block_slices {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words(unsafe {
compress_power_vsx(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
)
});
*blocks_compressed = blocks_compressed.wrapping_add(1);
}
}
#[cfg(target_arch = "powerpc64")]
fn chunk_compress_blocks_power_vsx_wrapper(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe { chunk_compress_blocks_power_vsx(chaining_value, chunk_counter, flags, blocks_compressed, blocks) }
}
#[cfg(target_arch = "powerpc64")]
#[target_feature(enable = "vsx")]
unsafe fn parent_cv_power_vsx(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
let mut block_words = [0u32; 16];
block_words[..8].copy_from_slice(&left_child_cv);
block_words[8..].copy_from_slice(&right_child_cv);
first_8_words(compress_simd_leaf(
&key_words,
&block_words,
0,
BLOCK_LEN as u32,
PARENT | flags,
))
}
#[cfg(target_arch = "powerpc64")]
fn parent_cv_power_vsx_wrapper(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe { parent_cv_power_vsx(left_child_cv, right_child_cv, key_words, flags) }
}
#[cfg(target_arch = "powerpc64")]
#[target_feature(enable = "vsx")]
unsafe fn hash_one_chunk_power_vsx(input: *const u8, key: &[u32; 8], counter: u64, flags: u32, out: *mut u8) {
let mut cv = *key;
let mut blocks_compressed = 0u8;
let body_len = CHUNK_LEN - BLOCK_LEN;
let body = unsafe { core::slice::from_raw_parts(input, body_len) };
unsafe { chunk_compress_blocks_power_vsx(&mut cv, counter, flags, &mut blocks_compressed, body) };
let tail_words = unsafe {
let tail_ptr = input.add(body_len);
words16_from_le_bytes_64(&*tail_ptr.cast::<[u8; BLOCK_LEN]>())
};
let start = if blocks_compressed == 0 { CHUNK_START } else { 0 };
let tail_flags = flags | start | super::CHUNK_END;
cv = first_8_words(unsafe { compress_power_vsx(&cv, &tail_words, counter, BLOCK_LEN as u32, tail_flags) });
unsafe { write_cv_words(out, &cv) };
}
#[cfg(target_arch = "powerpc64")]
#[target_feature(enable = "vsx")]
unsafe fn hash_many_contiguous_power_vsx(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
debug_assert!(num_chunks != 0);
let mut idx = 0usize;
while idx + 4 <= num_chunks {
unsafe {
hash4_contiguous_full_chunks_simd(
input.add(idx * CHUNK_LEN),
key,
counter.wrapping_add(idx as u64),
flags,
out.add(idx * OUT_LEN),
);
}
idx += 4;
}
while idx < num_chunks {
unsafe {
hash_one_chunk_power_vsx(
input.add(idx * CHUNK_LEN),
key,
counter.wrapping_add(idx as u64),
flags,
out.add(idx * OUT_LEN),
);
}
idx += 1;
}
}
#[cfg(target_arch = "powerpc64")]
unsafe fn hash_many_contiguous_power_vsx_wrapper(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
unsafe { hash_many_contiguous_power_vsx(input, num_chunks, key, counter, flags, out) }
}
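// riscv64 uses a separate single-block leaf: the whole state is splatted
// across all four lanes and only lane 0 of the result is read, presumably
// trading redundant lanes for straight-line vector code.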
#[cfg(target_arch = "riscv64")]
#[inline(always)]
fn compress_simd_leaf_riscv(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
type Vec4 = core::simd::u32x4;
let mut msg = [Vec4::splat(0); 16];
let mut w = 0usize;
while w < 16 {
msg[w] = Vec4::splat(block_words[w]);
w += 1;
}
let mut v = [
Vec4::splat(chaining_value[0]),
Vec4::splat(chaining_value[1]),
Vec4::splat(chaining_value[2]),
Vec4::splat(chaining_value[3]),
Vec4::splat(chaining_value[4]),
Vec4::splat(chaining_value[5]),
Vec4::splat(chaining_value[6]),
Vec4::splat(chaining_value[7]),
Vec4::splat(super::IV[0]),
Vec4::splat(super::IV[1]),
Vec4::splat(super::IV[2]),
Vec4::splat(super::IV[3]),
Vec4::splat(counter as u32),
Vec4::splat((counter >> 32) as u32),
Vec4::splat(block_len),
Vec4::splat(flags),
];
round4_simd(&mut v, &msg, 0);
round4_simd(&mut v, &msg, 1);
round4_simd(&mut v, &msg, 2);
round4_simd(&mut v, &msg, 3);
round4_simd(&mut v, &msg, 4);
round4_simd(&mut v, &msg, 5);
round4_simd(&mut v, &msg, 6);
let x0 = (v[0] ^ v[8]).to_array();
let x1 = (v[1] ^ v[9]).to_array();
let x2 = (v[2] ^ v[10]).to_array();
let x3 = (v[3] ^ v[11]).to_array();
let x4 = (v[4] ^ v[12]).to_array();
let x5 = (v[5] ^ v[13]).to_array();
let x6 = (v[6] ^ v[14]).to_array();
let x7 = (v[7] ^ v[15]).to_array();
let y0 = (v[8] ^ Vec4::splat(chaining_value[0])).to_array();
let y1 = (v[9] ^ Vec4::splat(chaining_value[1])).to_array();
let y2 = (v[10] ^ Vec4::splat(chaining_value[2])).to_array();
let y3 = (v[11] ^ Vec4::splat(chaining_value[3])).to_array();
let y4 = (v[12] ^ Vec4::splat(chaining_value[4])).to_array();
let y5 = (v[13] ^ Vec4::splat(chaining_value[5])).to_array();
let y6 = (v[14] ^ Vec4::splat(chaining_value[6])).to_array();
let y7 = (v[15] ^ Vec4::splat(chaining_value[7])).to_array();
[
x0[0], x1[0], x2[0], x3[0], x4[0], x5[0], x6[0], x7[0], y0[0], y1[0], y2[0], y3[0], y4[0], y5[0], y6[0], y7[0],
]
}
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "v")]
unsafe fn compress_riscv_v(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
compress_simd_leaf_riscv(chaining_value, block_words, counter, block_len, flags)
}
#[cfg(target_arch = "riscv64")]
fn compress_riscv_v_wrapper(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { compress_riscv_v(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "v")]
unsafe fn root_output_blocks1_riscv_v(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
out: *mut u8,
) {
let words = unsafe { compress_riscv_v(chaining_value, block_words, counter, block_len, flags) };
let out = unsafe { &mut *out.cast::<[u8; 2 * OUT_LEN]>() };
write_root_output_words(out, &words);
}
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "v")]
unsafe fn chunk_compress_blocks_riscv_v(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);
if blocks.len() == BLOCK_LEN {
let block_bytes: &[u8; BLOCK_LEN] = unsafe { &*(blocks.as_ptr().cast()) };
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words(unsafe {
compress_riscv_v(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
)
});
*blocks_compressed = blocks_compressed.wrapping_add(1);
return;
}
let (block_slices, remainder) = blocks.as_chunks::<BLOCK_LEN>();
debug_assert!(remainder.is_empty());
for block_bytes in block_slices {
let start = if *blocks_compressed == 0 { CHUNK_START } else { 0 };
let block_words = words16_from_le_bytes_64(block_bytes);
*chaining_value = first_8_words(unsafe {
compress_riscv_v(
chaining_value,
&block_words,
chunk_counter,
BLOCK_LEN as u32,
flags | start,
)
});
*blocks_compressed = blocks_compressed.wrapping_add(1);
}
}
#[cfg(target_arch = "riscv64")]
fn chunk_compress_blocks_riscv_v_wrapper(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe { chunk_compress_blocks_riscv_v(chaining_value, chunk_counter, flags, blocks_compressed, blocks) }
}
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "v")]
unsafe fn parent_cv_riscv_v(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
let mut block_words = [0u32; 16];
block_words[..8].copy_from_slice(&left_child_cv);
block_words[8..].copy_from_slice(&right_child_cv);
first_8_words(compress_simd_leaf(
&key_words,
&block_words,
0,
BLOCK_LEN as u32,
PARENT | flags,
))
}
#[cfg(target_arch = "riscv64")]
fn parent_cv_riscv_v_wrapper(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe { parent_cv_riscv_v(left_child_cv, right_child_cv, key_words, flags) }
}
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "v")]
unsafe fn hash_one_chunk_riscv_v(input: *const u8, key: &[u32; 8], counter: u64, flags: u32, out: *mut u8) {
let mut cv = *key;
let mut blocks_compressed = 0u8;
let body_len = CHUNK_LEN - BLOCK_LEN;
let body = unsafe { core::slice::from_raw_parts(input, body_len) };
unsafe { chunk_compress_blocks_riscv_v(&mut cv, counter, flags, &mut blocks_compressed, body) };
let tail_words = unsafe {
let tail_ptr = input.add(body_len);
words16_from_le_bytes_64(&*tail_ptr.cast::<[u8; BLOCK_LEN]>())
};
let start = if blocks_compressed == 0 { CHUNK_START } else { 0 };
let tail_flags = flags | start | super::CHUNK_END;
cv = first_8_words(unsafe { compress_riscv_v(&cv, &tail_words, counter, BLOCK_LEN as u32, tail_flags) });
unsafe { write_cv_words(out, &cv) };
}
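// Multi-chunk driver for the RISC-V vector kernel: full groups of four
// chunks go through hash4_contiguous_full_chunks_simd, and any remainder is
// hashed one chunk at a time. Callers are expected to have verified the "v"
// extension at dispatch time, e.g. (hypothetical caller, for illustration
// only; the real capability check lives in crate::platform::caps):
//
// if caps.has_riscv_v {
// unsafe { hash_many_contiguous_riscv_v(input, n, &key, 0, flags, out) };
// }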
#[cfg(target_arch = "riscv64")]
#[target_feature(enable = "v")]
unsafe fn hash_many_contiguous_riscv_v(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
debug_assert!(num_chunks != 0);
let mut idx = 0usize;
while idx + 4 <= num_chunks {
unsafe {
hash4_contiguous_full_chunks_simd(
input.add(idx * CHUNK_LEN),
key,
counter.wrapping_add(idx as u64),
flags,
out.add(idx * OUT_LEN),
);
}
idx += 4;
}
while idx < num_chunks {
unsafe {
hash_one_chunk_riscv_v(
input.add(idx * CHUNK_LEN),
key,
counter.wrapping_add(idx as u64),
flags,
out.add(idx * OUT_LEN),
);
}
idx += 1;
}
}
#[cfg(target_arch = "riscv64")]
unsafe fn hash_many_contiguous_riscv_v_wrapper(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
unsafe { hash_many_contiguous_riscv_v(input, num_chunks, key, counter, flags, out) }
}
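// x86-64 compress-cv adapters. These all share the X86CompressCvBytesFn
// shape: the block is passed as a raw pointer to BLOCK_LEN bytes rather than
// as pre-parsed words, and only the first 8 words of the compressed state
// (the new CV) are returned. Callers must guarantee the pointer is valid for
// BLOCK_LEN bytes, which is why even the portable adapter below is an
// unsafe fn.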
#[cfg(target_arch = "x86_64")]
unsafe fn x86_compress_cv_portable_wrapper(
cv: &[u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
let block_words = unsafe { words16_from_le_bytes_64(&*block.cast::<[u8; BLOCK_LEN]>()) };
first_8_words(super::compress(cv, &block_words, counter, block_len, flags))
}
#[cfg(target_arch = "x86_64")]
unsafe fn x86_compress_cv_sse41_wrapper(
cv: &[u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
unsafe { super::x86_64::compress_cv_sse41_bytes(cv, block, counter, block_len, flags) }
}
#[cfg(target_arch = "x86_64")]
unsafe fn x86_compress_cv_avx2_wrapper(
cv: &[u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
unsafe { super::x86_64::compress_cv_avx2_bytes(cv, block, counter, block_len, flags) }
}
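// On the OS/ABI targets where the hand-written assembly backend is built
// (linux, macos, windows), the AVX-512 path routes through it; other targets
// fall back to the intrinsics implementation.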
#[cfg(target_arch = "x86_64")]
unsafe fn x86_compress_cv_avx512_wrapper(
cv: &[u32; 8],
block: *const u8,
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 8] {
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
unsafe { super::x86_64::asm::compress_in_place_avx512(cv, block, counter, block_len, flags) }
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
unsafe { super::x86_64::compress_cv_avx512_bytes(cv, block, counter, block_len, flags) }
}
}
#[cfg(target_arch = "x86_64")]
fn compress_avx2_wrapper(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { super::x86_64::compress_avx2(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "x86_64")]
fn chunk_compress_blocks_avx2_wrapper(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe { super::x86_64::chunk_compress_blocks_avx2(chaining_value, chunk_counter, flags, blocks_compressed, blocks) }
}
#[cfg(target_arch = "x86_64")]
fn parent_cv_avx2_wrapper(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe { super::x86_64::parent_cv_avx2(left_child_cv, right_child_cv, key_words, flags) }
}
#[cfg(target_arch = "x86_64")]
fn compress_avx512_wrapper(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { super::x86_64::compress_avx512(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "x86_64")]
fn chunk_compress_blocks_avx512_wrapper(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe {
super::x86_64::chunk_compress_blocks_avx512(chaining_value, chunk_counter, flags, blocks_compressed, blocks)
}
}
#[cfg(target_arch = "x86_64")]
fn parent_cv_avx512_wrapper(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe { super::x86_64::parent_cv_avx512(left_child_cv, right_child_cv, key_words, flags) }
}
#[cfg(target_arch = "x86_64")]
fn compress_sse41_wrapper(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { super::x86_64::compress_sse41(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "x86_64")]
fn chunk_compress_blocks_sse41_wrapper(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe { super::x86_64::chunk_compress_blocks_sse41(chaining_value, chunk_counter, flags, blocks_compressed, blocks) }
}
#[cfg(target_arch = "x86_64")]
fn parent_cv_sse41_wrapper(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe { super::x86_64::parent_cv_sse41(left_child_cv, right_child_cv, key_words, flags) }
}
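// 4-wide SSE4.1 hash_many over contiguous chunks. Full groups of four are
// fed to hash4 directly. For a partial tail, missing lanes are pointed at
// the last real chunk so hash4 always reads valid memory; the duplicate
// lanes' outputs (and their off-by-N counters) are discarded when only
// num_chunks * OUT_LEN bytes are copied out of the scratch buffer.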
#[cfg(target_arch = "x86_64")]
unsafe fn hash_many_contiguous_sse41_wrapper(
input: *const u8,
mut num_chunks: usize,
key: &[u32; 8],
mut counter: u64,
flags: u32,
mut out: *mut u8,
) {
let mut input = input;
while num_chunks >= super::x86_64::sse41::DEGREE {
let ptrs = unsafe {
[
input,
input.add(CHUNK_LEN),
input.add(2 * CHUNK_LEN),
input.add(3 * CHUNK_LEN),
]
};
unsafe {
super::x86_64::sse41::hash4(
&ptrs,
CHUNK_LEN / BLOCK_LEN,
key,
counter,
true,
flags,
CHUNK_START,
super::CHUNK_END,
out,
);
input = input.add(super::x86_64::sse41::DEGREE * CHUNK_LEN);
out = out.add(super::x86_64::sse41::DEGREE * OUT_LEN);
}
counter = counter.wrapping_add(super::x86_64::sse41::DEGREE as u64);
num_chunks -= super::x86_64::sse41::DEGREE;
}
if num_chunks != 0 {
let last = unsafe { input.add((num_chunks - 1) * CHUNK_LEN) };
let ptrs = unsafe {
[
input,
if num_chunks > 1 { input.add(CHUNK_LEN) } else { last },
if num_chunks > 2 { input.add(2 * CHUNK_LEN) } else { last },
last,
]
};
let mut tmp = [0u8; super::x86_64::sse41::DEGREE * OUT_LEN];
unsafe {
super::x86_64::sse41::hash4(
&ptrs,
CHUNK_LEN / BLOCK_LEN,
key,
counter,
true,
flags,
CHUNK_START,
super::CHUNK_END,
tmp.as_mut_ptr(),
);
core::ptr::copy_nonoverlapping(tmp.as_ptr(), out, num_chunks * OUT_LEN);
}
}
}
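// 8-wide AVX2 variant of the same scheme. Where the assembly backend is
// available it accepts an arbitrary number of inputs, so the partial tail
// can be passed through directly; the intrinsics fallback instead pads the
// pointer array with the last chunk and copies the wanted outputs out of a
// scratch buffer, as in the SSE4.1 path.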
#[cfg(target_arch = "x86_64")]
unsafe fn hash_many_contiguous_avx2_wrapper(
input: *const u8,
mut num_chunks: usize,
key: &[u32; 8],
mut counter: u64,
flags: u32,
mut out: *mut u8,
) {
let mut input = input;
while num_chunks >= super::x86_64::avx2::DEGREE {
let ptrs = unsafe {
[
input,
input.add(CHUNK_LEN),
input.add(2 * CHUNK_LEN),
input.add(3 * CHUNK_LEN),
input.add(4 * CHUNK_LEN),
input.add(5 * CHUNK_LEN),
input.add(6 * CHUNK_LEN),
input.add(7 * CHUNK_LEN),
]
};
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
unsafe {
debug_assert!(flags <= u8::MAX as u32);
super::x86_64::asm::hash_many_avx2(
ptrs.as_ptr(),
super::x86_64::avx2::DEGREE,
CHUNK_LEN / BLOCK_LEN,
key.as_ptr(),
counter,
true,
flags as u8,
CHUNK_START as u8,
super::CHUNK_END as u8,
out,
);
input = input.add(super::x86_64::avx2::DEGREE * CHUNK_LEN);
out = out.add(super::x86_64::avx2::DEGREE * OUT_LEN);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
unsafe {
super::x86_64::avx2::hash8(
&ptrs,
CHUNK_LEN / BLOCK_LEN,
key,
counter,
true,
flags,
CHUNK_START,
super::CHUNK_END,
out,
);
input = input.add(super::x86_64::avx2::DEGREE * CHUNK_LEN);
out = out.add(super::x86_64::avx2::DEGREE * OUT_LEN);
}
counter = counter.wrapping_add(super::x86_64::avx2::DEGREE as u64);
num_chunks -= super::x86_64::avx2::DEGREE;
}
if num_chunks != 0 {
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
debug_assert!(num_chunks < super::x86_64::avx2::DEGREE);
debug_assert!(flags <= u8::MAX as u32);
let mut ptrs = [input; super::x86_64::avx2::DEGREE];
for (i, ptr) in ptrs.iter_mut().enumerate().take(num_chunks) {
*ptr = unsafe { input.add(i * CHUNK_LEN) };
}
unsafe {
super::x86_64::asm::hash_many_avx2(
ptrs.as_ptr(),
num_chunks,
CHUNK_LEN / BLOCK_LEN,
key.as_ptr(),
counter,
true,
flags as u8,
CHUNK_START as u8,
super::CHUNK_END as u8,
out,
);
}
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
let last = unsafe { input.add((num_chunks - 1) * CHUNK_LEN) };
let ptrs = unsafe {
[
input,
if num_chunks > 1 { input.add(CHUNK_LEN) } else { last },
if num_chunks > 2 { input.add(2 * CHUNK_LEN) } else { last },
if num_chunks > 3 { input.add(3 * CHUNK_LEN) } else { last },
if num_chunks > 4 { input.add(4 * CHUNK_LEN) } else { last },
if num_chunks > 5 { input.add(5 * CHUNK_LEN) } else { last },
if num_chunks > 6 { input.add(6 * CHUNK_LEN) } else { last },
last,
]
};
let mut tmp = [0u8; super::x86_64::avx2::DEGREE * OUT_LEN];
unsafe {
super::x86_64::avx2::hash8(
&ptrs,
CHUNK_LEN / BLOCK_LEN,
key,
counter,
true,
flags,
CHUNK_START,
super::CHUNK_END,
tmp.as_mut_ptr(),
);
core::ptr::copy_nonoverlapping(tmp.as_ptr(), out, num_chunks * OUT_LEN);
}
}
}
}
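// 16-wide AVX-512 variant. The intrinsics fallback differs from the
// narrower paths because hash16_contiguous wants one contiguous input region
// rather than a pointer per lane: partial tails are staged by copying the
// real chunks (plus copies of the last chunk for the unused lanes) into a
// stack buffer before hashing, and only num_chunks * OUT_LEN bytes of the
// result are copied out.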
#[cfg(target_arch = "x86_64")]
unsafe fn hash_many_contiguous_avx512_wrapper(
input: *const u8,
mut num_chunks: usize,
key: &[u32; 8],
mut counter: u64,
flags: u32,
mut out: *mut u8,
) {
let mut input = input;
while num_chunks >= super::x86_64::avx512::DEGREE {
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
unsafe {
debug_assert!(flags <= u8::MAX as u32);
let ptrs = [
input,
input.add(CHUNK_LEN),
input.add(2 * CHUNK_LEN),
input.add(3 * CHUNK_LEN),
input.add(4 * CHUNK_LEN),
input.add(5 * CHUNK_LEN),
input.add(6 * CHUNK_LEN),
input.add(7 * CHUNK_LEN),
input.add(8 * CHUNK_LEN),
input.add(9 * CHUNK_LEN),
input.add(10 * CHUNK_LEN),
input.add(11 * CHUNK_LEN),
input.add(12 * CHUNK_LEN),
input.add(13 * CHUNK_LEN),
input.add(14 * CHUNK_LEN),
input.add(15 * CHUNK_LEN),
];
super::x86_64::asm::hash_many_avx512(
ptrs.as_ptr(),
super::x86_64::avx512::DEGREE,
CHUNK_LEN / BLOCK_LEN,
key.as_ptr(),
counter,
true,
flags as u8,
CHUNK_START as u8,
super::CHUNK_END as u8,
out,
);
input = input.add(super::x86_64::avx512::DEGREE * CHUNK_LEN);
out = out.add(super::x86_64::avx512::DEGREE * OUT_LEN);
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
unsafe {
super::x86_64::avx512::hash16_contiguous(input, key, counter, flags, out);
input = input.add(super::x86_64::avx512::DEGREE * CHUNK_LEN);
out = out.add(super::x86_64::avx512::DEGREE * OUT_LEN);
}
counter = counter.wrapping_add(super::x86_64::avx512::DEGREE as u64);
num_chunks -= super::x86_64::avx512::DEGREE;
}
if num_chunks != 0 {
#[cfg(any(target_os = "linux", target_os = "macos", target_os = "windows"))]
{
debug_assert!(num_chunks < super::x86_64::avx512::DEGREE);
debug_assert!(flags <= u8::MAX as u32);
let mut ptrs = [input; super::x86_64::avx512::DEGREE];
for (i, ptr) in ptrs.iter_mut().enumerate().take(num_chunks) {
*ptr = unsafe { input.add(i * CHUNK_LEN) };
}
unsafe {
super::x86_64::asm::hash_many_avx512(
ptrs.as_ptr(),
num_chunks,
CHUNK_LEN / BLOCK_LEN,
key.as_ptr(),
counter,
true,
flags as u8,
CHUNK_START as u8,
super::CHUNK_END as u8,
out,
);
}
}
#[cfg(not(any(target_os = "linux", target_os = "macos", target_os = "windows")))]
{
debug_assert!(num_chunks < super::x86_64::avx512::DEGREE);
let mut tmp_input = [0u8; super::x86_64::avx512::DEGREE * CHUNK_LEN];
let mut tmp_out = [0u8; super::x86_64::avx512::DEGREE * OUT_LEN];
for i in 0..num_chunks {
unsafe {
core::ptr::copy_nonoverlapping(
input.add(i * CHUNK_LEN),
tmp_input.as_mut_ptr().add(i * CHUNK_LEN),
CHUNK_LEN,
);
}
}
let last_src_offset = (num_chunks - 1) * CHUNK_LEN;
for i in num_chunks..super::x86_64::avx512::DEGREE {
unsafe {
core::ptr::copy_nonoverlapping(
tmp_input.as_ptr().add(last_src_offset),
tmp_input.as_mut_ptr().add(i * CHUNK_LEN),
CHUNK_LEN,
);
}
}
unsafe {
super::x86_64::avx512::hash16_contiguous(tmp_input.as_ptr(), key, counter, flags, tmp_out.as_mut_ptr());
core::ptr::copy_nonoverlapping(tmp_out.as_ptr(), out, num_chunks * OUT_LEN);
}
}
}
}
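// aarch64/NEON kernel entry points. These are thin wrappers over the
// implementations in super::aarch64; runtime NEON detection is assumed to
// have happened in the dispatcher before a NEON kernel is selected.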
#[cfg(target_arch = "aarch64")]
fn compress_neon_wrapper(
chaining_value: &[u32; 8],
block_words: &[u32; 16],
counter: u64,
block_len: u32,
flags: u32,
) -> [u32; 16] {
unsafe { super::aarch64::compress_neon(chaining_value, block_words, counter, block_len, flags) }
}
#[cfg(target_arch = "aarch64")]
fn chunk_compress_blocks_neon_wrapper(
chaining_value: &mut [u32; 8],
chunk_counter: u64,
flags: u32,
blocks_compressed: &mut u8,
blocks: &[u8],
) {
unsafe { super::aarch64::chunk_compress_blocks_neon(chaining_value, chunk_counter, flags, blocks_compressed, blocks) }
}
#[cfg(target_arch = "aarch64")]
fn parent_cv_neon_wrapper(
left_child_cv: [u32; 8],
right_child_cv: [u32; 8],
key_words: [u32; 8],
flags: u32,
) -> [u32; 8] {
unsafe { super::aarch64::parent_cv_neon(left_child_cv, right_child_cv, key_words, flags) }
}
#[cfg(target_arch = "aarch64")]
unsafe fn hash_many_contiguous_neon_wrapper(
input: *const u8,
num_chunks: usize,
key: &[u32; 8],
counter: u64,
flags: u32,
out: *mut u8,
) {
unsafe { super::aarch64::hash_many_contiguous_neon(input, num_chunks, key, counter, flags, out) }
}