use crate::platform::Caps;
#[cfg(target_arch = "aarch64")]
use crate::platform::caps::aarch64;
#[cfg(target_arch = "powerpc64")]
use crate::platform::caps::power;
#[cfg(target_arch = "riscv64")]
use crate::platform::caps::riscv;
#[cfg(target_arch = "s390x")]
use crate::platform::caps::s390x;
#[cfg(target_arch = "wasm32")]
use crate::platform::caps::wasm;
#[cfg(target_arch = "x86_64")]
use crate::platform::caps::x86;
/// Single-block compression: (state `h`, 64-byte block, byte counter `t`,
/// is-last-block flag).
pub(crate) type CompressFn = fn(&mut [u32; 8], &[u8; 64], u64, bool);
/// Multi-block compression: (state `h`, whole 64-byte blocks, running byte
/// counter advanced in place).
pub(crate) type CompressBlocksFn = fn(&mut [u32; 8], &[u8], &mut u64);
/// Identifies one BLAKE2s compression kernel.
///
/// Only `Portable` exists on every target; each SIMD variant is gated on
/// the architecture it implements. Discriminant values are fixed so ids
/// stay stable across targets.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
#[repr(u8)]
#[non_exhaustive]
// NOTE(review): presumably some ids are never constructed on macOS builds,
// hence the target-specific allow — confirm.
#[cfg_attr(target_os = "macos", allow(dead_code))]
pub enum Blake2sKernelId {
    /// Pure-Rust scalar implementation, available everywhere.
    Portable = 0,
    /// x86-64 AVX2 kernel.
    #[cfg(target_arch = "x86_64")]
    X86Avx2 = 1,
    /// x86-64 AVX-512 (F + VL) kernel.
    #[cfg(target_arch = "x86_64")]
    X86Avx512vl = 2,
    /// AArch64 NEON kernel.
    #[allow(dead_code)]
    #[cfg(target_arch = "aarch64")]
    Aarch64Neon = 3,
    /// s390x vector-facility kernel.
    #[allow(dead_code)]
    #[cfg(target_arch = "s390x")]
    S390xVector = 4,
    /// POWER VSX kernel.
    #[allow(dead_code)]
    #[cfg(target_arch = "powerpc64")]
    PowerVsx = 5,
    /// RISC-V vector-extension kernel.
    #[cfg(target_arch = "riscv64")]
    Riscv64V = 6,
    /// WebAssembly SIMD128 kernel.
    #[cfg(target_arch = "wasm32")]
    WasmSimd128 = 7,
}
impl Blake2sKernelId {
    /// Short human-readable kernel name (`"arch/feature"`) for diagnostics
    /// and test output.
    #[cfg(any(test, feature = "diag"))]
    #[inline]
    #[must_use]
    pub const fn as_str(self) -> &'static str {
        match self {
            Self::Portable => "portable",
            #[cfg(target_arch = "x86_64")]
            Self::X86Avx2 => "x86/avx2",
            #[cfg(target_arch = "x86_64")]
            Self::X86Avx512vl => "x86/avx512vl",
            #[cfg(target_arch = "aarch64")]
            Self::Aarch64Neon => "aarch64/neon",
            #[cfg(target_arch = "s390x")]
            Self::S390xVector => "s390x/vector",
            #[cfg(target_arch = "powerpc64")]
            Self::PowerVsx => "power/vsx",
            #[cfg(target_arch = "riscv64")]
            Self::Riscv64V => "riscv64/v",
            #[cfg(target_arch = "wasm32")]
            Self::WasmSimd128 => "wasm/simd128",
        }
    }
}
/// Adapter exposing the NEON kernel with the plain `CompressFn` signature.
#[cfg(target_arch = "aarch64")]
fn compress_aarch64_neon(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    // SAFETY(review): assumes the CPU supports NEON — `required_caps` pairs
    // this kernel with `aarch64::NEON`; confirm callers check caps before
    // selecting this kernel.
    unsafe { super::aarch64::compress_neon(h, block, t, last) }
}
/// Adapter exposing the POWER VSX kernel with the plain `CompressFn` signature.
#[cfg(target_arch = "powerpc64")]
fn compress_power_vsx(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    // SAFETY(review): assumes the CPU supports VSX — `required_caps` pairs
    // this kernel with `power::VSX`; confirm callers check caps before
    // selecting this kernel.
    unsafe { super::power::compress_vsx(h, block, t, last) }
}
/// Adapter exposing the RISC-V vector kernel with the plain `CompressFn`
/// signature.
#[cfg(target_arch = "riscv64")]
fn compress_riscv64_v(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    // SAFETY(review): assumes the CPU supports the V extension —
    // `required_caps` pairs this kernel with `riscv::V`; confirm callers
    // check caps before selecting this kernel.
    unsafe { super::riscv64::compress_rvv(h, block, t, last) }
}
/// Adapter exposing the s390x vector-facility kernel with the plain
/// `CompressFn` signature.
#[cfg(target_arch = "s390x")]
fn compress_s390x_vector(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    // SAFETY(review): assumes the vector facility is available —
    // `required_caps` pairs this kernel with `s390x::VECTOR`; confirm
    // callers check caps before selecting this kernel.
    unsafe { super::s390x::compress_vector(h, block, t, last) }
}
/// Adapter exposing the WebAssembly SIMD128 kernel with the plain
/// `CompressFn` signature.
#[cfg(target_arch = "wasm32")]
fn compress_wasm_simd128(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    // SAFETY(review): assumes the host supports SIMD128 — `required_caps`
    // pairs this kernel with `wasm::SIMD128`; confirm callers check caps
    // before selecting this kernel.
    unsafe { super::wasm::compress_simd128(h, block, t, last) }
}
/// Adapter exposing the AVX2 kernel with the plain `CompressFn` signature.
#[cfg(target_arch = "x86_64")]
fn compress_x86_avx2(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    // SAFETY(review): assumes the CPU supports AVX2 — `required_caps`
    // pairs this kernel with `x86::AVX2`; confirm callers check caps before
    // selecting this kernel.
    unsafe { super::x86_64::compress_avx2(h, block, t, last) }
}
/// Adapter exposing the AVX-512VL kernel with the plain `CompressFn`
/// signature.
#[cfg(target_arch = "x86_64")]
fn compress_x86_avx512vl(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    // SAFETY(review): assumes the CPU supports AVX-512F and AVX-512VL —
    // `required_caps` pairs this kernel with `AVX512F | AVX512VL`; confirm
    // callers check caps before selecting this kernel.
    unsafe { super::x86_64::compress_avx512vl(h, block, t, last) }
}
/// Runs `compress` over every 64-byte block in `blocks`.
///
/// The byte counter `*t` is advanced by 64 *before* each call, matching
/// BLAKE2s semantics where `t` counts bytes including the block being
/// compressed. Blocks fed here are never final (`last == false`);
/// final-block handling is presumably done by the caller — confirm.
///
/// `blocks.len()` must be a multiple of 64 (debug-asserted). `strict_add`
/// panics on counter overflow rather than wrapping silently.
#[inline(always)]
fn compress_blocks_with(h: &mut [u32; 8], blocks: &[u8], t: &mut u64, compress: CompressFn) {
    debug_assert_eq!(blocks.len() % 64, 0);
    let mut chunks = blocks.chunks_exact(64);
    for chunk in &mut chunks {
        *t = t.strict_add(64);
        // `chunks_exact(64)` guarantees each chunk is exactly 64 bytes, so
        // this safe conversion never fails (and replaces the previous raw
        // pointer cast to `&[u8; 64]`).
        let block = chunk
            .first_chunk::<64>()
            .expect("chunks_exact(64) yields 64-byte chunks");
        compress(h, block, *t, false);
    }
    debug_assert!(chunks.remainder().is_empty());
}
/// `CompressBlocksFn` adapter driving the portable scalar kernel.
fn compress_blocks_portable(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress);
}
/// `CompressBlocksFn` adapter driving the NEON kernel over many blocks.
#[cfg(target_arch = "aarch64")]
fn compress_blocks_aarch64_neon(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress_aarch64_neon);
}
/// `CompressBlocksFn` adapter driving the POWER VSX kernel over many blocks.
#[cfg(target_arch = "powerpc64")]
fn compress_blocks_power_vsx(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress_power_vsx);
}
/// `CompressBlocksFn` adapter driving the RISC-V vector kernel over many
/// blocks.
#[cfg(target_arch = "riscv64")]
fn compress_blocks_riscv64_v(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress_riscv64_v);
}
/// `CompressBlocksFn` adapter driving the s390x vector kernel over many
/// blocks.
#[cfg(target_arch = "s390x")]
fn compress_blocks_s390x_vector(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress_s390x_vector);
}
/// `CompressBlocksFn` adapter driving the SIMD128 kernel over many blocks.
#[cfg(target_arch = "wasm32")]
fn compress_blocks_wasm_simd128(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress_wasm_simd128);
}
/// `CompressBlocksFn` adapter driving the AVX2 kernel over many blocks.
#[cfg(target_arch = "x86_64")]
fn compress_blocks_x86_avx2(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress_x86_avx2);
}
/// `CompressBlocksFn` adapter driving the AVX-512VL kernel over many blocks.
#[cfg(target_arch = "x86_64")]
fn compress_blocks_x86_avx512vl(h: &mut [u32; 8], blocks: &[u8], t: &mut u64) {
    compress_blocks_with(h, blocks, t, compress_x86_avx512vl);
}
/// Returns the single-block compression function for kernel `id`.
///
/// NOTE(review): selecting a non-portable kernel is only sound when the CPU
/// actually has `required_caps(id)` — the check is assumed to happen at the
/// selection site; confirm.
#[must_use]
pub(crate) fn compress_fn(id: Blake2sKernelId) -> CompressFn {
    match id {
        Blake2sKernelId::Portable => compress,
        #[cfg(target_arch = "x86_64")]
        Blake2sKernelId::X86Avx2 => compress_x86_avx2,
        #[cfg(target_arch = "x86_64")]
        Blake2sKernelId::X86Avx512vl => compress_x86_avx512vl,
        #[cfg(target_arch = "aarch64")]
        Blake2sKernelId::Aarch64Neon => compress_aarch64_neon,
        #[cfg(target_arch = "s390x")]
        Blake2sKernelId::S390xVector => compress_s390x_vector,
        #[cfg(target_arch = "powerpc64")]
        Blake2sKernelId::PowerVsx => compress_power_vsx,
        #[cfg(target_arch = "riscv64")]
        Blake2sKernelId::Riscv64V => compress_riscv64_v,
        #[cfg(target_arch = "wasm32")]
        Blake2sKernelId::WasmSimd128 => compress_wasm_simd128,
    }
}
/// Returns the multi-block compression function for kernel `id`.
///
/// Mirrors `compress_fn`; the same caps caveat applies: the caller is
/// assumed to have verified `required_caps(id)` before use.
#[must_use]
pub(crate) fn compress_blocks_fn(id: Blake2sKernelId) -> CompressBlocksFn {
    match id {
        Blake2sKernelId::Portable => compress_blocks_portable,
        #[cfg(target_arch = "x86_64")]
        Blake2sKernelId::X86Avx2 => compress_blocks_x86_avx2,
        #[cfg(target_arch = "x86_64")]
        Blake2sKernelId::X86Avx512vl => compress_blocks_x86_avx512vl,
        #[cfg(target_arch = "aarch64")]
        Blake2sKernelId::Aarch64Neon => compress_blocks_aarch64_neon,
        #[cfg(target_arch = "s390x")]
        Blake2sKernelId::S390xVector => compress_blocks_s390x_vector,
        #[cfg(target_arch = "powerpc64")]
        Blake2sKernelId::PowerVsx => compress_blocks_power_vsx,
        #[cfg(target_arch = "riscv64")]
        Blake2sKernelId::Riscv64V => compress_blocks_riscv64_v,
        #[cfg(target_arch = "wasm32")]
        Blake2sKernelId::WasmSimd128 => compress_blocks_wasm_simd128,
    }
}
/// CPU capability bits that must be present for kernel `id` to run.
///
/// `Portable` needs nothing; each SIMD kernel maps to the feature bits of
/// its architecture's caps module.
#[inline]
#[must_use]
// NOTE(review): apparently unreferenced on some target/feature
// combinations, hence the allow — confirm.
#[allow(dead_code)]
pub const fn required_caps(id: Blake2sKernelId) -> Caps {
    match id {
        Blake2sKernelId::Portable => Caps::NONE,
        #[cfg(target_arch = "x86_64")]
        Blake2sKernelId::X86Avx2 => x86::AVX2,
        #[cfg(target_arch = "x86_64")]
        Blake2sKernelId::X86Avx512vl => x86::AVX512F.union(x86::AVX512VL),
        #[cfg(target_arch = "aarch64")]
        Blake2sKernelId::Aarch64Neon => aarch64::NEON,
        #[cfg(target_arch = "s390x")]
        Blake2sKernelId::S390xVector => s390x::VECTOR,
        #[cfg(target_arch = "powerpc64")]
        Blake2sKernelId::PowerVsx => power::VSX,
        #[cfg(target_arch = "riscv64")]
        Blake2sKernelId::Riscv64V => riscv::V,
        #[cfg(target_arch = "wasm32")]
        Blake2sKernelId::WasmSimd128 => wasm::SIMD128,
    }
}
/// Every kernel id that exists for the compilation target, so tests can
/// iterate all available implementations.
#[cfg(test)]
pub const ALL: &[Blake2sKernelId] = &[
    Blake2sKernelId::Portable,
    #[cfg(target_arch = "x86_64")]
    Blake2sKernelId::X86Avx2,
    #[cfg(target_arch = "x86_64")]
    Blake2sKernelId::X86Avx512vl,
    #[cfg(target_arch = "aarch64")]
    Blake2sKernelId::Aarch64Neon,
    #[cfg(target_arch = "s390x")]
    Blake2sKernelId::S390xVector,
    #[cfg(target_arch = "powerpc64")]
    Blake2sKernelId::PowerVsx,
    #[cfg(target_arch = "riscv64")]
    Blake2sKernelId::Riscv64V,
    #[cfg(target_arch = "wasm32")]
    Blake2sKernelId::WasmSimd128,
];
/// True when compile-time target features already guarantee a SIMD kernel
/// (see `compile_time_best`); currently only the x86_64 AVX2 and
/// AVX-512F+VL combinations qualify.
pub(crate) const COMPILE_TIME_HW: bool = cfg!(any(
    all(target_arch = "x86_64", target_feature = "avx2"),
    all(
        target_arch = "x86_64",
        target_feature = "avx512f",
        target_feature = "avx512vl"
    ),
));
/// Best `CompressFn` determinable purely from compile-time target features.
///
/// Only the x86_64 AVX-512VL and AVX2 kernels are ever chosen statically;
/// every other configuration falls through to the portable implementation.
#[inline(always)]
pub(crate) fn compile_time_best() -> CompressFn {
    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512vl"))]
    {
        return compress_x86_avx512vl;
    }
    // The `not(...)` clause makes this branch mutually exclusive with the
    // AVX-512 one above, so at most one early return is compiled in.
    #[cfg(all(
        target_arch = "x86_64",
        target_feature = "avx2",
        not(all(target_feature = "avx512f", target_feature = "avx512vl"))
    ))]
    {
        return compress_x86_avx2;
    }
    // Dead only when one of the cfg branches above was compiled in.
    #[allow(unreachable_code)]
    compress
}
/// Best `CompressBlocksFn` determinable purely from compile-time target
/// features; mirrors `compile_time_best`.
#[inline(always)]
pub(crate) fn compile_time_best_blocks() -> CompressBlocksFn {
    #[cfg(all(target_arch = "x86_64", target_feature = "avx512f", target_feature = "avx512vl"))]
    {
        return compress_blocks_x86_avx512vl;
    }
    // Mutually exclusive with the AVX-512 branch above via the `not(...)`.
    #[cfg(all(
        target_arch = "x86_64",
        target_feature = "avx2",
        not(all(target_feature = "avx512f", target_feature = "avx512vl"))
    ))]
    {
        return compress_blocks_x86_avx2;
    }
    // Dead only when one of the cfg branches above was compiled in.
    #[allow(unreachable_code)]
    compress_blocks_portable
}
/// BLAKE2s initialization vector (RFC 7693, section 2.6) — the same
/// constants as SHA-256's initial hash value.
pub(crate) const IV: [u32; 8] = [
    0x6a09_e667,
    0xbb67_ae85,
    0x3c6e_f372,
    0xa54f_f53a,
    0x510e_527f,
    0x9b05_688c,
    0x1f83_d9ab,
    0x5be0_cd19,
];
/// Message-word permutation schedule (RFC 7693, section 2.7): in round `r`,
/// step `i` reads message word `m[SIGMA[r][i]]`.
// NOTE(review): presumably dead on targets whose SIMD backend embeds the
// schedule itself — confirm.
#[allow(dead_code)]
pub(crate) const SIGMA: [[u8; 16]; 10] = [
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
    [14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3],
    [11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4],
    [7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8],
    [9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13],
    [2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9],
    [12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11],
    [13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10],
    [6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5],
    [10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0],
];
/// Four `u32` lanes processed in lockstep — a scalar stand-in for a SIMD
/// vector, used by the portable compression routine.
#[derive(Clone, Copy)]
struct U32x4(u32, u32, u32, u32);

impl U32x4 {
    /// Builds a vector from four lane values.
    #[inline(always)]
    const fn new(a: u32, b: u32, c: u32, d: u32) -> Self {
        Self(a, b, c, d)
    }

    /// Loads four words of `src` selected by the given indices.
    #[inline(always)]
    fn gather(src: &[u32; 16], i0: usize, i1: usize, i2: usize, i3: usize) -> Self {
        debug_assert!(i0 < src.len() && i1 < src.len() && i2 < src.len() && i3 < src.len());
        // SAFETY: bounds are debug-asserted above; the in-file caller
        // (`round`) only passes SIGMA entries, all of which are < 16.
        unsafe {
            Self(
                *src.get_unchecked(i0),
                *src.get_unchecked(i1),
                *src.get_unchecked(i2),
                *src.get_unchecked(i3),
            )
        }
    }

    /// Lane-wise addition modulo 2^32.
    #[inline(always)]
    fn wrapping_add(self, other: Self) -> Self {
        let Self(a0, a1, a2, a3) = self;
        let Self(b0, b1, b2, b3) = other;
        Self(
            a0.wrapping_add(b0),
            a1.wrapping_add(b1),
            a2.wrapping_add(b2),
            a3.wrapping_add(b3),
        )
    }

    /// Rotates every lane right by `n` bits.
    #[inline(always)]
    fn rotate_right_const(self, n: u32) -> Self {
        let Self(a, b, c, d) = self;
        Self(
            a.rotate_right(n),
            b.rotate_right(n),
            c.rotate_right(n),
            d.rotate_right(n),
        )
    }

    /// Lane rotation toward index 0 by one: (a, b, c, d) -> (b, c, d, a).
    #[inline(always)]
    const fn shuffle_left_1(self) -> Self {
        Self(self.1, self.2, self.3, self.0)
    }

    /// Lane rotation toward index 0 by two: (a, b, c, d) -> (c, d, a, b).
    #[inline(always)]
    const fn shuffle_left_2(self) -> Self {
        Self(self.2, self.3, self.0, self.1)
    }

    /// Lane rotation toward index 0 by three: (a, b, c, d) -> (d, a, b, c).
    #[inline(always)]
    const fn shuffle_left_3(self) -> Self {
        Self(self.3, self.0, self.1, self.2)
    }

    /// Lane rotation toward index 3 by one — inverse of `shuffle_left_1`.
    #[inline(always)]
    const fn shuffle_right_1(self) -> Self {
        Self(self.3, self.0, self.1, self.2)
    }

    /// Lane rotation toward index 3 by two — inverse of `shuffle_left_2`.
    #[inline(always)]
    const fn shuffle_right_2(self) -> Self {
        Self(self.2, self.3, self.0, self.1)
    }

    /// Lane rotation toward index 3 by three — inverse of `shuffle_left_3`.
    #[inline(always)]
    const fn shuffle_right_3(self) -> Self {
        Self(self.1, self.2, self.3, self.0)
    }
}

impl core::ops::BitXor for U32x4 {
    type Output = Self;

    /// Lane-wise exclusive or.
    #[inline(always)]
    fn bitxor(self, rhs: Self) -> Self::Output {
        let Self(a0, a1, a2, a3) = self;
        let Self(b0, b1, b2, b3) = rhs;
        Self(a0 ^ b0, a1 ^ b1, a2 ^ b2, a3 ^ b3)
    }
}
/// Half of the BLAKE2s G function applied to all four columns (or
/// diagonals) in parallel:
/// `v0 += v1 + m; v3 = (v3 ^ v0) >>> rd; v2 += v3; v1 = (v1 ^ v2) >>> rb`.
/// `m` carries the message words injected into this step.
#[inline(always)]
fn quarter_round(v: &mut [U32x4; 4], rd: u32, rb: u32, m: U32x4) {
    v[0] = v[0].wrapping_add(v[1]).wrapping_add(m);
    v[3] = (v[3] ^ v[0]).rotate_right_const(rd);
    v[2] = v[2].wrapping_add(v[3]);
    v[1] = (v[1] ^ v[2]).rotate_right_const(rb);
}
/// Diagonalizes the 4x4 state: rotates row 1 left by one lane, row 2 by
/// two, row 3 by three, so the next `quarter_round` mixes diagonals.
#[inline(always)]
fn shuffle(v: &mut [U32x4; 4]) {
    v[1] = v[1].shuffle_left_1();
    v[2] = v[2].shuffle_left_2();
    v[3] = v[3].shuffle_left_3();
}
/// Inverse of `shuffle`: rotates the rows back so the state is in column
/// order again.
#[inline(always)]
fn unshuffle(v: &mut [U32x4; 4]) {
    v[1] = v[1].shuffle_right_1();
    v[2] = v[2].shuffle_right_2();
    v[3] = v[3].shuffle_right_3();
}
/// One full BLAKE2s round: a column step (rotations 16/12 then 8/7),
/// diagonalization, a diagonal step, and un-diagonalization. `s` is this
/// round's SIGMA row, selecting which message words of `m` feed each step.
#[inline(always)]
fn round(v: &mut [U32x4; 4], m: &[u32; 16], s: &[u8; 16]) {
    // Column step: even-indexed sigma entries feed the first half...
    quarter_round(
        v,
        16,
        12,
        U32x4::gather(m, s[0] as usize, s[2] as usize, s[4] as usize, s[6] as usize),
    );
    // ...and odd-indexed entries the second half.
    quarter_round(
        v,
        8,
        7,
        U32x4::gather(m, s[1] as usize, s[3] as usize, s[5] as usize, s[7] as usize),
    );
    shuffle(v);
    // Diagonal step over the rotated rows.
    quarter_round(
        v,
        16,
        12,
        U32x4::gather(m, s[8] as usize, s[10] as usize, s[12] as usize, s[14] as usize),
    );
    quarter_round(
        v,
        8,
        7,
        U32x4::gather(m, s[9] as usize, s[11] as usize, s[13] as usize, s[15] as usize),
    );
    unshuffle(v);
}
/// Loads a 64-byte message block as 16 little-endian `u32` words.
#[inline(always)]
pub(crate) fn load_msg(block: &[u8; 64]) -> [u32; 16] {
    let mut m = [0u32; 16];
    // `chunks_exact(4)` yields exactly 16 four-byte chunks from the
    // 64-byte block, so the conversion below never fails. This replaces
    // the previous `unsafe` unaligned pointer read with a safe form that
    // compiles to the same little-endian loads.
    for (word, bytes) in m.iter_mut().zip(block.chunks_exact(4)) {
        *word = u32::from_le_bytes(*bytes.first_chunk::<4>().expect("4-byte chunk"));
    }
    m
}
#[cfg(any(
target_arch = "x86_64",
target_arch = "aarch64",
target_arch = "wasm32",
target_arch = "riscv64",
target_arch = "s390x",
target_arch = "powerpc64"
))]
#[inline(always)]
pub(crate) fn init_v(h: &[u32; 8], t: u64, last: bool) -> [u32; 16] {
let mut v = [0u32; 16];
v[..8].copy_from_slice(h);
v[8] = IV[0];
v[9] = IV[1];
v[10] = IV[2];
v[11] = IV[3];
v[12] = IV[4] ^ (t as u32);
v[13] = IV[5] ^ ((t >> 32) as u32);
v[14] = if last { IV[6] ^ u32::MAX } else { IV[6] };
v[15] = IV[7];
v
}
/// Portable BLAKE2s compression function.
///
/// Mixes the 64-byte `block` into the chaining value `h`. `t` is the total
/// number of bytes processed so far (including this block); `last` marks
/// the final block, which sets the finalization word to all ones.
#[allow(clippy::indexing_slicing)]
pub(crate) fn compress(h: &mut [u32; 8], block: &[u8; 64], t: u64, last: bool) {
    let m = load_msg(block);
    // BLAKE2s uses a 2x32-bit counter: split the 64-bit byte count.
    let t0 = t as u32;
    let t1 = (t >> 32) as u32;
    let f0 = if last { u32::MAX } else { 0 };
    // State rows: h[0..4], h[4..8], IV[0..4], IV[4..8] xor (counter, flag).
    let mut v = [
        U32x4::new(h[0], h[1], h[2], h[3]),
        U32x4::new(h[4], h[5], h[6], h[7]),
        U32x4::new(IV[0], IV[1], IV[2], IV[3]),
        U32x4::new(IV[4] ^ t0, IV[5] ^ t1, IV[6] ^ f0, IV[7]),
    ];
    // Ten rounds, each with its own SIGMA message-word permutation.
    round(&mut v, &m, &SIGMA[0]);
    round(&mut v, &m, &SIGMA[1]);
    round(&mut v, &m, &SIGMA[2]);
    round(&mut v, &m, &SIGMA[3]);
    round(&mut v, &m, &SIGMA[4]);
    round(&mut v, &m, &SIGMA[5]);
    round(&mut v, &m, &SIGMA[6]);
    round(&mut v, &m, &SIGMA[7]);
    round(&mut v, &m, &SIGMA[8]);
    round(&mut v, &m, &SIGMA[9]);
    // Feed-forward: xor the two state halves back into the chaining value.
    let a = v[0] ^ v[2];
    let b = v[1] ^ v[3];
    h[0] ^= a.0;
    h[1] ^= a.1;
    h[2] ^= a.2;
    h[3] ^= a.3;
    h[4] ^= b.0;
    h[5] ^= b.1;
    h[6] ^= b.2;
    h[7] ^= b.3;
}