#![allow(unsafe_code)]
#![allow(clippy::inline_always)]
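//! SHA-256 block compression backed by the AArch64 SHA2 cryptographic extensions.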
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;
use crate::hashes::util::Aligned64;
#[cfg(target_arch = "aarch64")]
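/// The 64 SHA-256 round constants K[0]..K[63] (FIPS 180-4), stored 64-byte
/// aligned so each four-constant group is loaded with a single `vld1q_u32`.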
static K32: Aligned64<[u32; 64]> = Aligned64([
    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
]);
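/// Compresses a single 64-byte block into `state` using the Armv8 SHA2
/// instructions (SHA256H, SHA256H2, SHA256SU0, SHA256SU1), fully unrolled.
///
/// # Safety
///
/// The caller must ensure the running CPU supports the `sha2` target feature.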
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "sha2")]
pub(crate) unsafe fn compress_single_block_aarch64_sha2(state: &mut [u32; 8], block: &[u8; 64]) {
    unsafe {
        // Load the working state: `abcd` holds state words a..d, `efgh` holds e..h.
        let mut abcd = vld1q_u32(state.as_ptr());
        let mut efgh = vld1q_u32(state.as_ptr().add(4));
        let abcd_save = abcd;
        let efgh_save = efgh;
        let ptr = block.as_ptr();
        let kp = K32.0.as_ptr();
        // Load the 16 message words, byte-swapping each 32-bit lane from big-endian.
        let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr)));
        let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr.add(16))));
        let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr.add(32))));
        let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr.add(48))));
        // Rounds 0-15: the schedule is just the loaded message words.
        let mut tmp = vaddq_u32(s0, vld1q_u32(kp));
        let mut abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s1, vld1q_u32(kp.add(4)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s2, vld1q_u32(kp.add(8)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s3, vld1q_u32(kp.add(12)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        // Expand the message schedule to W[16..32]; each update of s1..s3 uses
        // the freshly updated earlier registers, as the recurrence requires.
        s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
        s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
        s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
        s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
        // Rounds 16-31.
        tmp = vaddq_u32(s0, vld1q_u32(kp.add(16)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s1, vld1q_u32(kp.add(20)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s2, vld1q_u32(kp.add(24)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s3, vld1q_u32(kp.add(28)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        // Expand the message schedule to W[32..48].
        s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
        s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
        s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
        s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
        // Rounds 32-47.
        tmp = vaddq_u32(s0, vld1q_u32(kp.add(32)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s1, vld1q_u32(kp.add(36)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s2, vld1q_u32(kp.add(40)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s3, vld1q_u32(kp.add(44)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        // Expand the message schedule to W[48..64].
        s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
        s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
        s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
        s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
        // Rounds 48-63.
        tmp = vaddq_u32(s0, vld1q_u32(kp.add(48)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s1, vld1q_u32(kp.add(52)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s2, vld1q_u32(kp.add(56)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        tmp = vaddq_u32(s3, vld1q_u32(kp.add(60)));
        abcd_prev = abcd;
        abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
        efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
        // Feed-forward: add the compressed state back into the input state.
        abcd = vaddq_u32(abcd, abcd_save);
        efgh = vaddq_u32(efgh, efgh_save);
        vst1q_u32(state.as_mut_ptr(), abcd);
        vst1q_u32(state.as_mut_ptr().add(4), efgh);
    }
}
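/// Compresses every 64-byte block of `blocks` into `state`, keeping the state
/// in vector registers across blocks and storing it only once at the end.
/// `blocks.len()` must be a multiple of 64.
///
/// # Safety
///
/// The caller must ensure the running CPU supports the `sha2` target feature.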
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "sha2")]
pub(crate) unsafe fn compress_blocks_aarch64_sha2(state: &mut [u32; 8], blocks: &[u8]) {
    debug_assert_eq!(blocks.len() % 64, 0);
    let (blocks, remainder) = blocks.as_chunks::<64>();
    debug_assert!(remainder.is_empty());
    if blocks.is_empty() {
        return;
    }
    // A single block takes the fully unrolled path.
    if blocks.len() == 1 {
        unsafe {
            compress_single_block_aarch64_sha2(state, &blocks[0]);
        }
        return;
    }
    unsafe {
        // Keep the running state in vector registers for the whole batch.
        let mut abcd = vld1q_u32(state.as_ptr());
        let mut efgh = vld1q_u32(state.as_ptr().add(4));
        let kp = K32.0.as_ptr();
        for block in blocks {
            let abcd_save = abcd;
            let efgh_save = efgh;
            let ptr = block.as_ptr();
            // Load the 16 message words, byte-swapping each 32-bit lane from big-endian.
            let mut s0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr)));
            let mut s1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr.add(16))));
            let mut s2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr.add(32))));
            let mut s3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(ptr.add(48))));
            // Rounds 0-15: the schedule is just the loaded message words.
            let mut tmp = vaddq_u32(s0, vld1q_u32(kp));
            let mut abcd_prev = abcd;
            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
            tmp = vaddq_u32(s1, vld1q_u32(kp.add(4)));
            abcd_prev = abcd;
            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
            tmp = vaddq_u32(s2, vld1q_u32(kp.add(8)));
            abcd_prev = abcd;
            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
            tmp = vaddq_u32(s3, vld1q_u32(kp.add(12)));
            abcd_prev = abcd;
            abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
            efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
            // Rounds 16-63: interleave message-schedule expansion with the rounds.
            for t in (16..64).step_by(16) {
                s0 = vsha256su1q_u32(vsha256su0q_u32(s0, s1), s2, s3);
                tmp = vaddq_u32(s0, vld1q_u32(kp.add(t)));
                abcd_prev = abcd;
                abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
                efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
                s1 = vsha256su1q_u32(vsha256su0q_u32(s1, s2), s3, s0);
                tmp = vaddq_u32(s1, vld1q_u32(kp.add(t + 4)));
                abcd_prev = abcd;
                abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
                efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
                s2 = vsha256su1q_u32(vsha256su0q_u32(s2, s3), s0, s1);
                tmp = vaddq_u32(s2, vld1q_u32(kp.add(t + 8)));
                abcd_prev = abcd;
                abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
                efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
                s3 = vsha256su1q_u32(vsha256su0q_u32(s3, s0), s1, s2);
                tmp = vaddq_u32(s3, vld1q_u32(kp.add(t + 12)));
                abcd_prev = abcd;
                abcd = vsha256hq_u32(abcd_prev, efgh, tmp);
                efgh = vsha256h2q_u32(efgh, abcd_prev, tmp);
            }
            // Feed-forward: add the compressed state back into the running state.
            abcd = vaddq_u32(abcd, abcd_save);
            efgh = vaddq_u32(efgh, efgh_save);
        }
        // Store the final state once, after all blocks have been processed.
        vst1q_u32(state.as_mut_ptr(), abcd);
        vst1q_u32(state.as_mut_ptr().add(4), efgh);
    }
}
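// A minimal sanity-check sketch, assuming the crate's tests run with `std`
// available for runtime feature detection. The vectors are the standard
// FIPS 180-4 initial hash values and the well-known digest of "abc"; the
// second test only cross-checks the two code paths against each other.
#[cfg(all(test, target_arch = "aarch64"))]
mod tests {
    use super::*;

    /// SHA-256 initial hash values H0..H7 (FIPS 180-4).
    const H0: [u32; 8] = [
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
    ];

    #[test]
    fn abc_single_block() {
        if !std::arch::is_aarch64_feature_detected!("sha2") {
            return; // Skip on hardware without the SHA2 extensions.
        }
        // "abc" padded to one 512-bit block: message, 0x80, zeros, then the
        // 64-bit big-endian bit length (24).
        let mut block = [0u8; 64];
        block[..3].copy_from_slice(b"abc");
        block[3] = 0x80;
        block[63] = 24;
        let mut state = H0;
        unsafe { compress_single_block_aarch64_sha2(&mut state, &block) };
        assert_eq!(
            state,
            [
                0xba7816bf, 0x8f01cfa3, 0x414140de, 0x5dae2237,
                0xb00361a3, 0x96177a9c, 0xb410ff61, 0xf20015ad,
            ]
        );
    }

    #[test]
    fn multi_block_matches_single_block_path() {
        if !std::arch::is_aarch64_feature_detected!("sha2") {
            return; // Skip on hardware without the SHA2 extensions.
        }
        let mut blocks = [0u8; 192];
        for (i, b) in blocks.iter_mut().enumerate() {
            *b = i as u8;
        }
        // Feed the blocks one at a time through the unrolled path...
        let mut expected = H0;
        for block in blocks.chunks_exact(64) {
            unsafe { compress_single_block_aarch64_sha2(&mut expected, block.try_into().unwrap()) };
        }
        // ...and all at once through the batched path; the states must agree.
        let mut actual = H0;
        unsafe { compress_blocks_aarch64_sha2(&mut actual, &blocks) };
        assert_eq!(expected, actual);
    }
}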