#![allow(unsafe_code)]
#![allow(unused_unsafe)]
/// Reports whether the running CPU can execute the x86-64 SHA-256 kernel.
///
/// Returns `true` only when every feature named in the `#[target_feature]`
/// attribute of `x86::compress256_blocks` (`sha`, `sse2`, `ssse3`, `sse4.1`)
/// is detected at runtime.
#[cfg(target_arch = "x86_64")]
pub(super) fn sha256_supported() -> bool {
    let sha_ext = std::is_x86_feature_detected!("sha");
    let sse2 = std::is_x86_feature_detected!("sse2");
    let ssse3 = std::is_x86_feature_detected!("ssse3");
    let sse4_1 = std::is_x86_feature_detected!("sse4.1");
    sha_ext && sse2 && ssse3 && sse4_1
}
/// Reports whether the running CPU can execute the AArch64 SHA-256 kernel.
///
/// `sha2` is the single feature `arm::compress256_blocks` is compiled with,
/// so it is the only one probed here.
#[cfg(target_arch = "aarch64")]
pub(super) fn sha256_supported() -> bool {
    std::arch::is_aarch64_feature_detected!("sha2")
}
/// Compresses a single 64-byte block into the SHA-256 chaining value `h`.
///
/// Thin convenience wrapper: the block is viewed as a slice and handed to
/// `compress256_blocks`, so the same preconditions apply (in particular, no
/// CPU feature check is performed here).
pub(super) fn compress256(h: &mut [u32; 8], block: &[u8; 64]) {
    compress256_blocks(h, block.as_slice());
}
/// Compresses every 64-byte block of `data` into the SHA-256 chaining value
/// `h`, dispatching to the kernel compiled for the current architecture.
///
/// * `data.len()` is expected to be a multiple of 64; this is only checked
///   when debug assertions are enabled. The kernels process `data.len() / 64`
///   blocks, so any trailing partial block is left unread.
/// * Empty input returns immediately and leaves `h` untouched.
/// * On targets other than x86-64 and AArch64 no kernel is compiled into this
///   function and `h` is never modified.
///
/// This function does not check CPU features itself. Presumably callers gate
/// it on `sha256_supported()` — confirm against the parent module.
pub(super) fn compress256_blocks(h: &mut [u32; 8], data: &[u8]) {
    debug_assert!(data.len().is_multiple_of(64));
    if !data.is_empty() {
        #[cfg(target_arch = "x86_64")]
        // SAFETY: the callee only requires `sha`, `sse2`, `ssse3` and `sse4.1`
        // CPU support. That is not verified here; it is the caller's
        // responsibility (see the function-level note).
        unsafe {
            x86::compress256_blocks(h, data)
        }
        #[cfg(target_arch = "aarch64")]
        // SAFETY: the callee only requires `sha2` CPU support. That is not
        // verified here; it is the caller's responsibility (see the
        // function-level note).
        unsafe {
            arm::compress256_blocks(h, data)
        }
    }
}
/// Reports whether the running CPU can execute the AArch64 SHA-512 kernel.
///
/// The probed feature is `sha3`, matching the `#[target_feature]` attribute
/// that `arm::compress512` is compiled with.
#[cfg(target_arch = "aarch64")]
pub(super) fn sha512_supported() -> bool {
    std::arch::is_aarch64_feature_detected!("sha3")
}
/// Compresses a single 128-byte block into the SHA-512 chaining value `h`
/// using the AArch64 kernel.
///
/// This function does not check CPU features itself. Presumably callers gate
/// it on `sha512_supported()` — confirm against the parent module.
#[cfg(target_arch = "aarch64")]
pub(super) fn compress512(h: &mut [u64; 8], block: &[u8; 128]) {
    // SAFETY: the callee only requires `sha3` CPU support. That is not
    // verified here; it is the caller's responsibility (see the
    // function-level note).
    unsafe {
        arm::compress512(h, block);
    }
}
#[cfg(target_arch = "x86_64")]
mod x86 {
    //! SHA-256 block compression built on the x86 SHA extension intrinsics.
    use crate::hash::sha256::K256;
    use core::arch::x86_64::*;

    /// Runs the SHA-256 compression function over every complete 64-byte
    /// block of `data`, updating the chaining value in `state` in place.
    ///
    /// `data.len() / 64` blocks are consumed; a trailing partial block is
    /// never read. With no complete block, `state` is written back unchanged.
    ///
    /// # Safety
    ///
    /// The executing CPU must support the `sha`, `sse2`, `ssse3` and `sse4.1`
    /// target features.
    #[target_feature(enable = "sha,sse2,ssse3,sse4.1")]
    pub(super) unsafe fn compress256_blocks(state: &mut [u32; 8], data: &[u8]) {
        // SAFETY: all loads and stores are unaligned-tolerant (`loadu`/`storeu`)
        // and stay in bounds: `state` is 32 bytes (two 16-byte vectors), each
        // chunk is exactly 64 bytes (four vectors), and the round-constant
        // loads touch `K256[4 * group..4 * group + 4]` for `group < 16`
        // (assumes `K256` holds at least 64 `u32` entries). The required CPU
        // features are guaranteed by this function's own safety contract.
        unsafe {
            // Byte shuffle that reverses the bytes of every 32-bit lane, turning
            // the big-endian message words into native integers.
            let bswap32 = _mm_set_epi64x(
                0x0c0d_0e0f_0809_0a0bu64 as i64,
                0x0405_0607_0001_0203u64 as i64,
            );

            // Repack the linear a..h state into the two-register layout the
            // `sha256rnds2` instruction works on.
            let lo_half = _mm_loadu_si128(state.as_ptr().cast::<__m128i>());
            let hi_half = _mm_loadu_si128(state.as_ptr().add(4).cast::<__m128i>());
            let lo_swapped = _mm_shuffle_epi32::<0xB1>(lo_half);
            let hi_reversed = _mm_shuffle_epi32::<0x1B>(hi_half);
            let mut abef = _mm_alignr_epi8::<8>(lo_swapped, hi_reversed);
            let mut cdgh = _mm_blend_epi16::<0xF0>(hi_reversed, lo_swapped);

            let round_keys = K256.as_ptr();

            for block in data.chunks_exact(64) {
                let words = block.as_ptr().cast::<__m128i>();
                let (abef_in, cdgh_in) = (abef, cdgh);

                // Rolling window over the message schedule, four words per slot.
                let mut w = [
                    _mm_shuffle_epi8(_mm_loadu_si128(words), bswap32),
                    _mm_shuffle_epi8(_mm_loadu_si128(words.add(1)), bswap32),
                    _mm_shuffle_epi8(_mm_loadu_si128(words.add(2)), bswap32),
                    _mm_shuffle_epi8(_mm_loadu_si128(words.add(3)), bswap32),
                ];

                // Each iteration performs four rounds (two `sha256rnds2` calls).
                for group in 0..16usize {
                    let cur = group % 4;
                    let next = (group + 1) % 4;
                    let prev = (group + 3) % 4;

                    let key = _mm_loadu_si128(round_keys.add(4 * group).cast::<__m128i>());
                    let wk = _mm_add_epi32(w[cur], key);

                    // The two registers swap roles between the two calls; after
                    // both, `abef`/`cdgh` hold the layout their names say.
                    cdgh = _mm_sha256rnds2_epu32(cdgh, abef, wk);
                    if matches!(group, 3..=14) {
                        // Finish the schedule words consumed by the next group.
                        let carry = _mm_alignr_epi8::<4>(w[cur], w[prev]);
                        w[next] = _mm_sha256msg2_epu32(_mm_add_epi32(w[next], carry), w[cur]);
                    }
                    // The second pair of rounds uses the upper two W+K sums.
                    abef = _mm_sha256rnds2_epu32(abef, cdgh, _mm_shuffle_epi32::<0x0E>(wk));
                    if matches!(group, 1..=12) {
                        // Start the schedule words that are finished two groups later.
                        w[prev] = _mm_sha256msg1_epu32(w[prev], w[cur]);
                    }
                }

                // Feed-forward: add the working variables onto the block's input state.
                abef = _mm_add_epi32(abef, abef_in);
                cdgh = _mm_add_epi32(cdgh, cdgh_in);
            }

            // Undo the register packing and store the state back as a..h.
            let abef_reversed = _mm_shuffle_epi32::<0x1B>(abef);
            let cdgh_swapped = _mm_shuffle_epi32::<0xB1>(cdgh);
            let out_lo = _mm_blend_epi16::<0xF0>(abef_reversed, cdgh_swapped);
            let out_hi = _mm_alignr_epi8::<8>(cdgh_swapped, abef_reversed);
            _mm_storeu_si128(state.as_mut_ptr().cast::<__m128i>(), out_lo);
            _mm_storeu_si128(state.as_mut_ptr().add(4).cast::<__m128i>(), out_hi);
        }
    }
}
#[cfg(target_arch = "aarch64")]
mod arm {
    //! SHA-256 and SHA-512 compression kernels built on the AArch64 SHA intrinsics.
    use crate::hash::sha256::K256;
    use crate::hash::sha512::K512;
    use core::arch::aarch64::*;

    /// Runs the SHA-256 compression function over every complete 64-byte
    /// block of `data`, updating the chaining value in `state` in place.
    ///
    /// `data.len() / 64` blocks are consumed; a trailing partial block is
    /// never read.
    ///
    /// # Safety
    ///
    /// The executing CPU must support the `sha2` target feature.
    #[target_feature(enable = "sha2")]
    pub(super) unsafe fn compress256_blocks(state: &mut [u32; 8], data: &[u8]) {
        // SAFETY: every access stays in bounds: `state` is two 4-lane vectors,
        // each chunk is exactly 64 bytes (four 16-byte loads), and the
        // round-constant loads touch `K256[4 * group..4 * group + 4]` for
        // `group < 16` (assumes `K256` holds at least 64 `u32` entries). The
        // required CPU feature is guaranteed by this function's safety contract.
        unsafe {
            let mut abcd = vld1q_u32(state.as_ptr());
            let mut efgh = vld1q_u32(state.as_ptr().add(4));

            for block in data.chunks_exact(64) {
                let src = block.as_ptr();
                let (abcd_in, efgh_in) = (abcd, efgh);

                // `vrev32q_u8` reverses the bytes of each 32-bit lane, turning
                // the big-endian message words into native integers.
                let mut w = [
                    vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(src))),
                    vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(src.add(16)))),
                    vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(src.add(32)))),
                    vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(src.add(48)))),
                ];

                // Each iteration performs four rounds.
                for group in 0..16usize {
                    let cur = group % 4;
                    // W+K must be formed before the slot is overwritten below.
                    let wk = vaddq_u32(w[cur], vld1q_u32(K256.as_ptr().add(4 * group)));
                    if group < 12 {
                        // Replace the slot with the schedule words needed four
                        // groups from now.
                        let partial = vsha256su0q_u32(w[cur], w[(cur + 1) % 4]);
                        w[cur] = vsha256su1q_u32(partial, w[(cur + 2) % 4], w[(cur + 3) % 4]);
                    }
                    // The second half-update needs the pre-update `abcd`.
                    let abcd_before = abcd;
                    abcd = vsha256hq_u32(abcd_before, efgh, wk);
                    efgh = vsha256h2q_u32(efgh, abcd_before, wk);
                }

                // Feed-forward: add the working variables onto the block's input state.
                abcd = vaddq_u32(abcd, abcd_in);
                efgh = vaddq_u32(efgh, efgh_in);
            }

            vst1q_u32(state.as_mut_ptr(), abcd);
            vst1q_u32(state.as_mut_ptr().add(4), efgh);
        }
    }

    /// Runs the SHA-512 compression function over one 128-byte `block`,
    /// updating the chaining value in `state` in place.
    ///
    /// # Safety
    ///
    /// The executing CPU must support the `sha3` target feature.
    #[target_feature(enable = "sha3")]
    pub(super) unsafe fn compress512(state: &mut [u64; 8], block: &[u8; 128]) {
        // One step of the round function, covering two rounds. `$w` holds the
        // two schedule words and `$t` indexes their round constants. The four
        // state registers are passed in rotating order by the call sites;
        // `$d` and `$b` are the two that get rewritten.
        macro_rules! round_pair {
            ($a:ident, $b:ident, $c:ident, $d:ident, $w:expr, $t:expr) => {{
                let wk = vaddq_u64($w, vld1q_u64(K512.as_ptr().add($t)));
                let seed = vaddq_u64(vextq_u64::<1>(wk, wk), $d);
                let mid = vsha512hq_u64(seed, vextq_u64::<1>($c, $d), vextq_u64::<1>($b, $c));
                $d = vsha512h2q_u64(mid, $b, $a);
                $b = vaddq_u64($b, mid);
            }};
        }

        // Overwrites schedule slot `$cur` with the words 16 positions later,
        // derived from the following slot, the preceding slot, and the pair of
        // slots straddling the midpoint of the 8-slot window.
        macro_rules! schedule {
            ($cur:ident, $next:ident, $prev:ident, $mid_lo:ident, $mid_hi:ident) => {
                $cur = vsha512su1q_u64(
                    vsha512su0q_u64($cur, $next),
                    $prev,
                    vextq_u64::<1>($mid_lo, $mid_hi),
                );
            };
        }

        // SAFETY: every access stays in bounds: `state` is four 2-lane vectors,
        // `block` is exactly 128 bytes (eight 16-byte loads), and the
        // round-constant loads touch `K512[t..t + 2]` for even `t <= 78`
        // (assumes `K512` holds at least 80 `u64` entries). The required CPU
        // feature is guaranteed by this function's safety contract.
        unsafe {
            let mut ab = vld1q_u64(state.as_ptr());
            let mut cd = vld1q_u64(state.as_ptr().add(2));
            let mut ef = vld1q_u64(state.as_ptr().add(4));
            let mut gh = vld1q_u64(state.as_ptr().add(6));
            let (ab_in, cd_in, ef_in, gh_in) = (ab, cd, ef, gh);

            // `vrev64q_u8` reverses the bytes of each 64-bit lane, turning the
            // big-endian message words into native integers.
            let src = block.as_ptr();
            let mut s0 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src)));
            let mut s1 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src.add(16))));
            let mut s2 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src.add(32))));
            let mut s3 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src.add(48))));
            let mut s4 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src.add(64))));
            let mut s5 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src.add(80))));
            let mut s6 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src.add(96))));
            let mut s7 = vreinterpretq_u64_u8(vrev64q_u8(vld1q_u8(src.add(112))));

            // Rounds 0..16 consume the message words as loaded.
            round_pair!(ab, cd, ef, gh, s0, 0);
            round_pair!(gh, ab, cd, ef, s1, 2);
            round_pair!(ef, gh, ab, cd, s2, 4);
            round_pair!(cd, ef, gh, ab, s3, 6);
            round_pair!(ab, cd, ef, gh, s4, 8);
            round_pair!(gh, ab, cd, ef, s5, 10);
            round_pair!(ef, gh, ab, cd, s6, 12);
            round_pair!(cd, ef, gh, ab, s7, 14);

            // Rounds 16..80: extend the schedule in place, then consume it.
            for t in (16usize..80).step_by(16) {
                schedule!(s0, s1, s7, s4, s5);
                round_pair!(ab, cd, ef, gh, s0, t);
                schedule!(s1, s2, s0, s5, s6);
                round_pair!(gh, ab, cd, ef, s1, t + 2);
                schedule!(s2, s3, s1, s6, s7);
                round_pair!(ef, gh, ab, cd, s2, t + 4);
                schedule!(s3, s4, s2, s7, s0);
                round_pair!(cd, ef, gh, ab, s3, t + 6);
                schedule!(s4, s5, s3, s0, s1);
                round_pair!(ab, cd, ef, gh, s4, t + 8);
                schedule!(s5, s6, s4, s1, s2);
                round_pair!(gh, ab, cd, ef, s5, t + 10);
                schedule!(s6, s7, s5, s2, s3);
                round_pair!(ef, gh, ab, cd, s6, t + 12);
                schedule!(s7, s0, s6, s3, s4);
                round_pair!(cd, ef, gh, ab, s7, t + 14);
            }

            // Feed-forward: add the working variables onto the input state.
            vst1q_u64(state.as_mut_ptr(), vaddq_u64(ab, ab_in));
            vst1q_u64(state.as_mut_ptr().add(2), vaddq_u64(cd, cd_in));
            vst1q_u64(state.as_mut_ptr().add(4), vaddq_u64(ef, ef_in));
            vst1q_u64(state.as_mut_ptr().add(6), vaddq_u64(gh, gh_in));
        }
    }
}