#![allow(unsafe_code)]
#![allow(clippy::inline_always)]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(target_arch = "x86_64")]
// SHA-256 round constants K[0..63] (FIPS 180-4, section 4.2.2), packed four
// per 128-bit vector so each table entry feeds one group of four rounds.
// SAFETY: `[[u32; 4]; 16]` and `[__m128i; 16]` are both 256 bytes and any bit
// pattern is a valid `__m128i`. On x86-64 (little-endian) this transmute
// places K[4*i + j] into 32-bit lane j of vector i, which is exactly the lane
// order `_mm_add_epi32(msg, K4[i])` needs in the compression loop below.
static K4: [__m128i; 16] = unsafe {
core::mem::transmute([
[0x428a2f98u32, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5],
[0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5],
[0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3],
[0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174],
[0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc],
[0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da],
[0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7],
[0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967],
[0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13],
[0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85],
[0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3],
[0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070],
[0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5],
[0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3],
[0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208],
[0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2],
])
};
#[cfg(target_arch = "x86_64")]
// Shuffle-control mask for `_mm_shuffle_epi8` that reverses the bytes of each
// 32-bit lane, i.e. converts the big-endian message words of a SHA-256 input
// block to native little-endian u32s.
// SAFETY: `[u8; 16]` and `__m128i` have identical size (16 bytes) and every
// bit pattern is a valid `__m128i`, so the transmute is sound.
static BSWAP_MASK: __m128i = unsafe { core::mem::transmute([3u8, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12]) };
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "sha,sse4.1")]
/// SHA-256 compression over one or more 64-byte blocks using the x86 SHA
/// extensions (`sha256rnds2` / `sha256msg1` / `sha256msg2`).
///
/// `state` holds the eight 32-bit working variables a..h and is updated in
/// place. `blocks` must be a whole number of 64-byte blocks (checked only by
/// `debug_assert!`); an empty slice is a no-op.
///
/// # Safety
/// The caller must guarantee the executing CPU supports the `sha` and
/// `sse4.1` target features (e.g. via runtime feature detection) before
/// calling, otherwise executing this function is undefined behavior.
pub(crate) unsafe fn compress_blocks_sha_ni(state: &mut [u32; 8], blocks: &[u8]) {
debug_assert_eq!(blocks.len() % 64, 0);
if blocks.is_empty() {
return;
}
unsafe {
// Working state kept in the interleaved lane order the sha256rnds2
// instruction operates on: `abef` holds words a,b,e,f and `cdgh` holds
// c,d,g,h (standard SHA-NI "ABEF"/"CDGH" layout).
let mut abef;
let mut cdgh;
{
// Re-lane the linear state: load (a,b,c,d) and (e,f,g,h), then use
// shuffle/alignr/blend to produce the ABEF and CDGH vectors.
let tmp0 = _mm_shuffle_epi32(_mm_loadu_si128(state.as_ptr().cast()), 0xB1);
let tmp1 = _mm_shuffle_epi32(_mm_loadu_si128(state.as_ptr().add(4).cast()), 0x1B);
abef = _mm_alignr_epi8(tmp0, tmp1, 8);
cdgh = _mm_blend_epi16(tmp1, tmp0, 0xF0);
}
let num_blocks = blocks.len() / 64;
let mut ptr = blocks.as_ptr();
for _ in 0..num_blocks {
// Remember the state entering this block for the feed-forward
// addition after the 64 rounds (FIPS 180-4, step 4 of 6.2.2).
let abef_save = abef;
let cdgh_save = cdgh;
// Load the 64-byte block as four vectors of message words, byte-
// swapping each 32-bit lane from big-endian wire order.
let mut msg0 = _mm_shuffle_epi8(_mm_loadu_si128(ptr.cast()), BSWAP_MASK);
let mut msg1 = _mm_shuffle_epi8(_mm_loadu_si128(ptr.add(16).cast()), BSWAP_MASK);
let mut msg2 = _mm_shuffle_epi8(_mm_loadu_si128(ptr.add(32).cast()), BSWAP_MASK);
let mut msg3 = _mm_shuffle_epi8(_mm_loadu_si128(ptr.add(48).cast()), BSWAP_MASK);
// Each 4-round group below follows the same shape: tmp = W + K,
// sha256rnds2 performs two rounds consuming the two low lanes of
// tmp, the 0x0E shuffle moves the two high lanes down, and a second
// sha256rnds2 performs the next two rounds. Interleaved with the
// rounds, sha256msg1/sha256msg2 (with alignr supplying the w[t-7]
// term) extend the message schedule four words at a time.
// Rounds 0-3
let mut tmp = _mm_add_epi32(msg0, K4[0]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
// Rounds 4-7
tmp = _mm_add_epi32(msg1, K4[1]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg0 = _mm_sha256msg1_epu32(msg0, msg1);
// Rounds 8-11
tmp = _mm_add_epi32(msg2, K4[2]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg1 = _mm_sha256msg1_epu32(msg1, msg2);
// Rounds 12-15
tmp = _mm_add_epi32(msg3, K4[3]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg0 = _mm_add_epi32(msg0, _mm_alignr_epi8(msg3, msg2, 4));
msg0 = _mm_sha256msg2_epu32(msg0, msg3);
msg2 = _mm_sha256msg1_epu32(msg2, msg3);
// Rounds 16-19
tmp = _mm_add_epi32(msg0, K4[4]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg1 = _mm_add_epi32(msg1, _mm_alignr_epi8(msg0, msg3, 4));
msg1 = _mm_sha256msg2_epu32(msg1, msg0);
msg3 = _mm_sha256msg1_epu32(msg3, msg0);
// Rounds 20-23
tmp = _mm_add_epi32(msg1, K4[5]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg2 = _mm_add_epi32(msg2, _mm_alignr_epi8(msg1, msg0, 4));
msg2 = _mm_sha256msg2_epu32(msg2, msg1);
msg0 = _mm_sha256msg1_epu32(msg0, msg1);
// Rounds 24-27
tmp = _mm_add_epi32(msg2, K4[6]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg3 = _mm_add_epi32(msg3, _mm_alignr_epi8(msg2, msg1, 4));
msg3 = _mm_sha256msg2_epu32(msg3, msg2);
msg1 = _mm_sha256msg1_epu32(msg1, msg2);
// Rounds 28-31
tmp = _mm_add_epi32(msg3, K4[7]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg0 = _mm_add_epi32(msg0, _mm_alignr_epi8(msg3, msg2, 4));
msg0 = _mm_sha256msg2_epu32(msg0, msg3);
msg2 = _mm_sha256msg1_epu32(msg2, msg3);
// Rounds 32-35
tmp = _mm_add_epi32(msg0, K4[8]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg1 = _mm_add_epi32(msg1, _mm_alignr_epi8(msg0, msg3, 4));
msg1 = _mm_sha256msg2_epu32(msg1, msg0);
msg3 = _mm_sha256msg1_epu32(msg3, msg0);
// Rounds 36-39
tmp = _mm_add_epi32(msg1, K4[9]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg2 = _mm_add_epi32(msg2, _mm_alignr_epi8(msg1, msg0, 4));
msg2 = _mm_sha256msg2_epu32(msg2, msg1);
msg0 = _mm_sha256msg1_epu32(msg0, msg1);
// Rounds 40-43
tmp = _mm_add_epi32(msg2, K4[10]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg3 = _mm_add_epi32(msg3, _mm_alignr_epi8(msg2, msg1, 4));
msg3 = _mm_sha256msg2_epu32(msg3, msg2);
msg1 = _mm_sha256msg1_epu32(msg1, msg2);
// Rounds 44-47
tmp = _mm_add_epi32(msg3, K4[11]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg0 = _mm_add_epi32(msg0, _mm_alignr_epi8(msg3, msg2, 4));
msg0 = _mm_sha256msg2_epu32(msg0, msg3);
msg2 = _mm_sha256msg1_epu32(msg2, msg3);
// Rounds 48-51
tmp = _mm_add_epi32(msg0, K4[12]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg1 = _mm_add_epi32(msg1, _mm_alignr_epi8(msg0, msg3, 4));
msg1 = _mm_sha256msg2_epu32(msg1, msg0);
msg3 = _mm_sha256msg1_epu32(msg3, msg0);
// Rounds 52-55 (message schedule winds down: no further sha256msg1
// is needed from here on).
tmp = _mm_add_epi32(msg1, K4[13]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg2 = _mm_add_epi32(msg2, _mm_alignr_epi8(msg1, msg0, 4));
msg2 = _mm_sha256msg2_epu32(msg2, msg1);
// Rounds 56-59
tmp = _mm_add_epi32(msg2, K4[14]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
msg3 = _mm_add_epi32(msg3, _mm_alignr_epi8(msg2, msg1, 4));
msg3 = _mm_sha256msg2_epu32(msg3, msg2);
// Rounds 60-63
tmp = _mm_add_epi32(msg3, K4[15]);
cdgh = _mm_sha256rnds2_epu32(cdgh, abef, tmp);
tmp = _mm_shuffle_epi32(tmp, 0x0E);
abef = _mm_sha256rnds2_epu32(abef, cdgh, tmp);
// Feed-forward: add the state saved at the top of this block.
abef = _mm_add_epi32(abef, abef_save);
cdgh = _mm_add_epi32(cdgh, cdgh_save);
ptr = ptr.add(64);
}
// Undo the ABEF/CDGH interleaving and store the state back as the
// linear words (a,b,c,d) and (e,f,g,h).
let tmp0 = _mm_shuffle_epi32(abef, 0x1B);
let tmp1 = _mm_shuffle_epi32(cdgh, 0xB1);
_mm_storeu_si128(state.as_mut_ptr().cast(), _mm_blend_epi16(tmp0, tmp1, 0xF0));
_mm_storeu_si128(state.as_mut_ptr().add(4).cast(), _mm_alignr_epi8(tmp1, tmp0, 8));
} }