#![allow(clippy::indexing_slicing)]
#[cfg(target_arch = "wasm32")]
use core::arch::wasm32::*;
use super::{BLOCK_LEN, K, ch, maj};
use crate::hashes::util::rotr32;
/// Loads 16 bytes from `ptr` and reverses the byte order inside each 4-byte group.
///
/// WebAssembly is little-endian, so after the swap every 32-bit lane of the result
/// holds the big-endian interpretation of its four source bytes.
///
/// # Safety
///
/// `ptr` must be valid for reading 16 bytes.
/// NOTE(review): `v128_load` is documented as an alignment-1 load, so no particular
/// alignment should be needed — confirm against the `core::arch::wasm32` docs.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
unsafe fn load_be(ptr: *const u8) -> v128 {
    // SAFETY: the caller promises that `ptr` is readable for 16 bytes.
    let bytes = unsafe { v128_load(ptr.cast::<v128>()) };
    // Both shuffle operands are the same vector, so indices 0..=15 pick bytes of
    // `bytes`; each run of four indices is listed in descending order.
    i8x16_shuffle::<3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12>(bytes, bytes)
}
/// XORs together `rotr32(x, 2)`, `rotr32(x, 13)` and `rotr32(x, 22)`.
///
/// The rotation amounts are those of the SHA-256 `Σ0` function (FIPS 180-4);
/// `rotr32` is presumably a 32-bit rotate-right — it is defined outside this file.
#[inline(always)]
fn big_sigma0(x: u32) -> u32 {
    let by_2 = rotr32(x, 2);
    let by_13 = rotr32(x, 13);
    let by_22 = rotr32(x, 22);
    by_2 ^ by_13 ^ by_22
}
/// XORs together `rotr32(x, 6)`, `rotr32(x, 11)` and `rotr32(x, 25)`.
///
/// The rotation amounts are those of the SHA-256 `Σ1` function (FIPS 180-4);
/// `rotr32` is presumably a 32-bit rotate-right — it is defined outside this file.
#[inline(always)]
fn big_sigma1(x: u32) -> u32 {
    let by_6 = rotr32(x, 6);
    let by_11 = rotr32(x, 11);
    let by_25 = rotr32(x, 25);
    by_6 ^ by_11 ^ by_25
}
/// XORs together `rotr32(x, 7)`, `rotr32(x, 18)` and the logical shift `x >> 3`.
///
/// The amounts are those of the SHA-256 `σ0` message-schedule function (FIPS 180-4);
/// `rotr32` is presumably a 32-bit rotate-right — it is defined outside this file.
#[inline(always)]
fn small_sigma0(x: u32) -> u32 {
    let by_7 = rotr32(x, 7);
    let by_18 = rotr32(x, 18);
    // Plain shift, not a rotation: the low three bits are discarded.
    let shifted = x >> 3;
    by_7 ^ by_18 ^ shifted
}
/// XORs together `rotr32(x, 17)`, `rotr32(x, 19)` and the logical shift `x >> 10`.
///
/// The amounts are those of the SHA-256 `σ1` message-schedule function (FIPS 180-4);
/// `rotr32` is presumably a 32-bit rotate-right — it is defined outside this file.
#[inline(always)]
fn small_sigma1(x: u32) -> u32 {
    let by_17 = rotr32(x, 17);
    let by_19 = rotr32(x, 19);
    // Plain shift, not a rotation: the low ten bits are discarded.
    let shifted = x >> 10;
    by_17 ^ by_19 ^ shifted
}
/// Returns the 32-bit schedule word number `idx` from the packed schedule `w`.
///
/// Words are stored four per vector: word `idx` lives in vector `(idx / 4) % 16`,
/// lane `idx % 4`. The vector index wraps modulo 16, so out-of-range `idx` values
/// alias earlier vectors instead of panicking.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn schedule_word(w: &[v128; 16], idx: usize) -> u32 {
    let group = w[(idx / 4) % 16];
    let lane = idx % 4;
    // The lane number is a const generic, so each lane needs its own call site.
    if lane == 0 {
        u32x4_extract_lane::<0>(group)
    } else if lane == 1 {
        u32x4_extract_lane::<1>(group)
    } else if lane == 2 {
        u32x4_extract_lane::<2>(group)
    } else {
        u32x4_extract_lane::<3>(group)
    }
}
/// Computes four consecutive message-schedule words and stores them as vector `i`.
///
/// With `base = 4 * i`, lanes 0..=3 of the stored vector receive words
/// `base .. base + 3`, each produced by the recurrence
/// `W[n] = small_sigma1(W[n-2]) + W[n-7] + small_sigma0(W[n-15]) + W[n-16]`
/// using wrapping addition. Lanes 2 and 3 depend on lanes 0 and 1 of the same
/// group, so those are fed forward directly rather than read back from `w`.
///
/// The result is written to `w[i & 0xF]`. Because the recurrence reaches back
/// 16 words, `i` must be at least 4 (`base - 16` underflows otherwise); the only
/// caller in this file passes `4..=15`.
#[cfg(target_arch = "wasm32")]
#[inline(always)]
fn schedule_4(w: &mut [v128; 16], i: usize) {
    // One application of the schedule recurrence.
    #[inline(always)]
    fn expand(w2: u32, w7: u32, w15: u32, w16: u32) -> u32 {
        small_sigma1(w2)
            .wrapping_add(w7)
            .wrapping_add(small_sigma0(w15))
            .wrapping_add(w16)
    }

    let base = i << 2;

    // Previously computed words, named by how far they sit behind `base`.
    let back16 = schedule_word(w, base - 16);
    let back15 = schedule_word(w, base - 15);
    let back14 = schedule_word(w, base - 14);
    let back13 = schedule_word(w, base - 13);
    let back12 = schedule_word(w, base - 12);
    let back7 = schedule_word(w, base - 7);
    let back6 = schedule_word(w, base - 6);
    let back5 = schedule_word(w, base - 5);
    let back4 = schedule_word(w, base - 4);
    let back2 = schedule_word(w, base - 2);
    let back1 = schedule_word(w, base - 1);

    let new0 = expand(back2, back7, back15, back16);
    let new1 = expand(back1, back6, back14, back15);
    // W[base + 2] and W[base + 3] need W[base] and W[base + 1], computed just above.
    let new2 = expand(new0, back5, back13, back14);
    let new3 = expand(new1, back4, back12, back13);

    w[i & 0xF] = u32x4(new0, new1, new2, new3);
}
/// Folds every complete `BLOCK_LEN`-byte block of `blocks` into `state`.
///
/// For each block the first 16 schedule words are loaded as big-endian 32-bit
/// values (four per vector), the remaining 48 words are produced four at a time
/// by `schedule_4`, and 64 rounds using the constants in `K` are applied to the
/// working variables. The working variables are then added (wrapping) to `state`.
///
/// `blocks.len()` is expected to be a multiple of `BLOCK_LEN`; this is checked
/// with `debug_assert_eq!` only, and a trailing partial block is otherwise
/// ignored. An empty slice leaves `state` untouched.
///
/// # Safety
///
/// Every memory access in the body stays inside `blocks`.
/// NOTE(review): the `unsafe` marker appears to come from
/// `#[target_feature(enable = "simd128")]`; callers should only reach this
/// function when `simd128` is available — confirm against the dispatch site.
#[cfg(target_arch = "wasm32")]
#[target_feature(enable = "simd128")]
pub(crate) unsafe fn compress_blocks_wasm_simd(state: &mut [u32; 8], blocks: &[u8]) {
    debug_assert_eq!(blocks.len() % BLOCK_LEN, 0);

    // `chunks_exact` yields only full blocks, so no pointer stepping is needed.
    for block in blocks.chunks_exact(BLOCK_LEN) {
        let mut a = state[0];
        let mut b = state[1];
        let mut c = state[2];
        let mut d = state[3];
        let mut e = state[4];
        let mut f = state[5];
        let mut g = state[6];
        let mut h = state[7];

        // Vectors 0..4 hold the 16 message words; 4..16 are filled in by `schedule_4`.
        let mut wv: [v128; 16] = [u32x4_splat(0); 16];
        for (slot, bytes) in wv[..4].iter_mut().zip(block.chunks_exact(16)) {
            // SAFETY: `bytes` is a 16-byte subslice of `block`, so its pointer is
            // valid for the 16-byte read that `load_be` performs.
            *slot = unsafe { load_be(bytes.as_ptr()) };
        }

        // One round: `$k` is the round constant, `$w` the schedule word.
        macro_rules! sha_round {
            ($k:expr, $w:expr) => {{
                let t1 = h
                    .wrapping_add(big_sigma1(e))
                    .wrapping_add(ch(e, f, g))
                    .wrapping_add($k)
                    .wrapping_add($w);
                let t2 = big_sigma0(a).wrapping_add(maj(a, b, c));
                h = g;
                g = f;
                f = e;
                e = d.wrapping_add(t1);
                d = c;
                c = b;
                b = a;
                a = t1.wrapping_add(t2);
            }};
        }

        // Rounds 0..16 consume the message words directly.
        for r in 0..16 {
            sha_round!(K[r], schedule_word(&wv, r));
        }

        // Rounds 16..64: extend the schedule by one 4-word group, then use it.
        for group in 4..16 {
            schedule_4(&mut wv, group);
            let sched = wv[group];
            let r = group * 4;
            sha_round!(K[r], u32x4_extract_lane::<0>(sched));
            sha_round!(K[r + 1], u32x4_extract_lane::<1>(sched));
            sha_round!(K[r + 2], u32x4_extract_lane::<2>(sched));
            sha_round!(K[r + 3], u32x4_extract_lane::<3>(sched));
        }

        // Feed the working variables back into the chaining state.
        for (word, add) in state.iter_mut().zip([a, b, c, d, e, f, g, h]) {
            *word = word.wrapping_add(add);
        }
    }
}