#![cfg_attr(target_feature = "avx512f", allow(dead_code, unused))]
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;
#[cfg(target_arch = "x86")]
use core::arch::x86::*;
use crate::Align16;
use crate::sha256::K32;
const K32X4: [[u32; 4]; 16] = [
[K32[3], K32[2], K32[1], K32[0]],
[K32[7], K32[6], K32[5], K32[4]],
[K32[11], K32[10], K32[9], K32[8]],
[K32[15], K32[14], K32[13], K32[12]],
[K32[19], K32[18], K32[17], K32[16]],
[K32[23], K32[22], K32[21], K32[20]],
[K32[27], K32[26], K32[25], K32[24]],
[K32[31], K32[30], K32[29], K32[28]],
[K32[35], K32[34], K32[33], K32[32]],
[K32[39], K32[38], K32[37], K32[36]],
[K32[43], K32[42], K32[41], K32[40]],
[K32[47], K32[46], K32[45], K32[44]],
[K32[51], K32[50], K32[49], K32[48]],
[K32[55], K32[54], K32[53], K32[52]],
[K32[59], K32[58], K32[57], K32[56]],
[K32[63], K32[62], K32[61], K32[60]],
];
/// Expands the next four message-schedule words from the previous sixteen.
///
/// `$v0..$v3` are the four most recent 128-bit groups of schedule words, oldest first.
/// The expansion calls SHA-NI / SSSE3 intrinsics, so it must be used inside an `unsafe`
/// context on a CPU that supports them. `$v3` is evaluated twice.
macro_rules! schedule {
    ($v0:expr, $v1:expr, $v2:expr, $v3:expr) => {{
        // First half of the recurrence, computed from the two oldest groups.
        let partial = _mm_sha256msg1_epu32($v0, $v1);
        // Last word of `$v2` followed by the first three words of `$v3`, i.e. the group
        // shifted by one 32-bit word.
        let shifted = _mm_alignr_epi8($v3, $v2, 4);
        // Second half of the recurrence, which needs the newest group.
        _mm_sha256msg2_epu32(_mm_add_epi32(partial, shifted), $v3)
    }};
}
/// Rearranges a SHA-256 state into the two-register layout used by the SHA-NI round
/// instruction.
///
/// Taking `state` as the eight words `[a, b, c, d, e, f, g, h]`, the result is
/// `[abef, cdgh]`, where each name lists the 32-bit lanes from most significant to
/// least significant.
#[inline(always)]
pub(crate) fn prepare_state(state: &Align16<[u32; 8]>) -> [__m128i; 2] {
    // SAFETY: `state` covers eight `u32`s, i.e. exactly the two 128-bit vectors read
    // below. The aligned loads rely on `Align16` providing 16-byte alignment (assumed
    // from the type name; it is defined elsewhere). The shuffles need SSSE3/SSE4.1.
    // NOTE(review): this is a safe fn, so confirm callers are only reachable when those
    // CPU features are available.
    unsafe {
        let base = state.as_ptr().cast::<__m128i>();
        // Swap neighbouring words of the first half: lanes become c, d, a, b (high to low).
        let cdab = _mm_shuffle_epi32(_mm_load_si128(base), 0xB1);
        // Reverse the second half: lanes become e, f, g, h (high to low).
        let efgh = _mm_shuffle_epi32(_mm_load_si128(base.add(1)), 0x1B);
        [
            // Low half of `cdab` on top of the high half of `efgh`.
            _mm_alignr_epi8(cdab, efgh, 8),
            // High half from `cdab`, low half from `efgh`.
            _mm_blend_epi16(efgh, cdab, 0xF0),
        ]
    }
}
/// Per-lane hook for patching message words before the rounds run.
///
/// `multiway_arx_abef_cdgh` broadcasts one block template to every lane and then calls
/// each `pluck_qwordN` once per lane with a mutable reference to that lane's `N`-th
/// 128-bit message vector. Every method defaults to a no-op, so an implementor only
/// overrides the vectors it wants to modify.
pub trait Plucker: Sized {
    /// Hook for the first message vector of lane `_lane`; does nothing by default.
    #[inline(always)]
    fn pluck_qword0(&mut self, _lane: usize, _w: &mut __m128i) {}

    /// Hook for the second message vector of lane `_lane`; does nothing by default.
    #[inline(always)]
    fn pluck_qword1(&mut self, _lane: usize, _w: &mut __m128i) {}

    /// Hook for the third message vector of lane `_lane`; does nothing by default.
    #[inline(always)]
    fn pluck_qword2(&mut self, _lane: usize, _w: &mut __m128i) {}

    /// Hook for the fourth message vector of lane `_lane`; does nothing by default.
    #[inline(always)]
    fn pluck_qword3(&mut self, _lane: usize, _w: &mut __m128i) {}
}

/// The unit plucker leaves every lane's message exactly as the template provides it.
impl Plucker for () {}
#[inline(always)]
pub(crate) fn multiway_arx_abef_cdgh<
const BEGIN_ROUND_BY_4: usize,
const LANES: usize,
P: Plucker,
>(
mut state: [&mut [__m128i; 2]; LANES],
block_template: &Align16<[u32; 16]>,
mut plucker: P,
) {
unsafe {
macro_rules! rounds4 {
($abef:ident, $cdgh:ident, $rest:expr, $i:expr) => {{
let k = K32X4[$i];
let kv = _mm_set_epi32(k[0] as i32, k[1] as i32, k[2] as i32, k[3] as i32);
let t1: [_; LANES] = core::array::from_fn(|i| _mm_add_epi32($rest[i], kv));
$cdgh = core::array::from_fn(|i| _mm_sha256rnds2_epu32($cdgh[i], $abef[i], t1[i]));
let t2: [_; LANES] = core::array::from_fn(|i| _mm_shuffle_epi32(t1[i], 0x0E));
$abef = core::array::from_fn(|i| _mm_sha256rnds2_epu32($abef[i], $cdgh[i], t2[i]));
}};
}
macro_rules! schedule_rounds4 {
(
$abef:ident, $cdgh:ident,
$w0:expr, $w1:expr, $w2:expr, $w3:expr, $w4:expr,
$i: expr
) => {{
$w4 = core::array::from_fn(|i| schedule!($w0[i], $w1[i], $w2[i], $w3[i]));
rounds4!($abef, $cdgh, $w4, $i);
}};
}
let mut abef: [_; LANES] = core::array::from_fn(|i| state[i][0]);
let mut cdgh: [_; LANES] = core::array::from_fn(|i| state[i][1]);
let w0_t = _mm_load_si128(block_template.as_ptr().cast::<u32>().add(0).cast());
let w1_t = _mm_load_si128(block_template.as_ptr().cast::<u32>().add(4).cast());
let w2_t = _mm_load_si128(block_template.as_ptr().cast::<u32>().add(8).cast());
let w3_t = _mm_load_si128(block_template.as_ptr().cast::<u32>().add(12).cast());
let mut w0: [_; LANES] = [w0_t; LANES];
let mut w1: [_; LANES] = [w1_t; LANES];
let mut w2: [_; LANES] = [w2_t; LANES];
let mut w3: [_; LANES] = [w3_t; LANES];
for i in 0..LANES {
plucker.pluck_qword0(i, &mut w0[i]);
plucker.pluck_qword1(i, &mut w1[i]);
plucker.pluck_qword2(i, &mut w2[i]);
plucker.pluck_qword3(i, &mut w3[i]);
}
let mut w4: [_; LANES] = core::array::from_fn(|i| schedule!(w0[i], w1[i], w2[i], w3[i]));
macro_rules! gate_rnds {
($cutoff: literal, $($body:tt)*) => {
if $cutoff >= BEGIN_ROUND_BY_4 * 4 {
$($body)*
}
};
}
gate_rnds!(0, {
rounds4!(abef, cdgh, w0, 0);
});
gate_rnds!(4, {
rounds4!(abef, cdgh, w1, 1);
});
gate_rnds!(8, {
rounds4!(abef, cdgh, w2, 2);
});
gate_rnds!(12, {
rounds4!(abef, cdgh, w3, 3);
});
gate_rnds!(16, {
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 4);
});
gate_rnds!(20, {
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 5);
});
gate_rnds!(24, {
schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 6);
});
gate_rnds!(28, {
schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 7);
});
gate_rnds!(32, {
schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 8);
});
gate_rnds!(36, {
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 9);
});
gate_rnds!(40, {
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 10);
});
gate_rnds!(44, {
schedule_rounds4!(abef, cdgh, w2, w3, w4, w0, w1, 11);
});
gate_rnds!(48, {
schedule_rounds4!(abef, cdgh, w3, w4, w0, w1, w2, 12);
});
gate_rnds!(52, {
schedule_rounds4!(abef, cdgh, w4, w0, w1, w2, w3, 13);
});
gate_rnds!(56, {
schedule_rounds4!(abef, cdgh, w0, w1, w2, w3, w4, 14);
});
gate_rnds!(60, {
schedule_rounds4!(abef, cdgh, w1, w2, w3, w4, w0, 15);
});
state.iter_mut().zip(abef).for_each(|(state, abef)| {
_mm_store_si128(
state
.as_mut_slice()
.as_mut_ptr()
.cast::<u32>()
.add(0)
.cast(),
abef,
);
});
state.iter_mut().zip(cdgh).for_each(|(state, cdgh)| {
_mm_store_si128(
state
.as_mut_slice()
.as_mut_ptr()
.cast::<u32>()
.add(4)
.cast(),
cdgh,
);
});
}
}
// The test imports `core::arch::x86_64` and uses `_mm_extract_epi64`, both of which
// exist only on x86_64, so the module is gated on the architecture as well as on the
// compile-time SSE4.1 and SHA target features.
#[cfg(all(
    target_arch = "x86_64",
    target_feature = "sse4.1",
    target_feature = "sha"
))]
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs two identical lanes over the padded "abc" block and checks that the `a`
    /// and `b` words of each lane match the scalar reference rounds.
    #[test]
    fn test_multiway_arx() {
        use core::arch::x86_64::*;
        // Single padded block for the message "abc" (bit length 0x18).
        let input_block = [
            0x61626380, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
            0x00000000, 0x00000018,
        ];
        let state = Align16(crate::sha256::IV);
        let mut prepared_state_0 = prepare_state(&state);
        let mut prepared_state_1 = prepare_state(&state);
        multiway_arx_abef_cdgh::<0, 2, ()>(
            [&mut prepared_state_0, &mut prepared_state_1],
            &Align16(input_block),
            (),
        );

        // Scalar reference: expand the schedule, then run the rounds from the IV.
        let mut full_message_schedule = [0u32; 64];
        full_message_schedule[0..16].copy_from_slice(&input_block);
        crate::sha256::do_message_schedule(&mut full_message_schedule);
        let mut reference_state = Align16(crate::sha256::IV);
        crate::sha256::sha2_arx::<0>(&mut reference_state, &full_message_schedule);

        // Both lanes received the same input, so both must agree with the reference.
        for (lane, prepared) in [prepared_state_0, prepared_state_1].iter().enumerate() {
            // SAFETY: this module only compiles when SSE4.1 is enabled for the whole
            // build, which is what the extract intrinsics require.
            let a = unsafe { _mm_extract_epi32(prepared[0], 3) as u32 };
            let b = unsafe { _mm_extract_epi32(prepared[0], 2) as u32 };
            // Cross-check via the upper 64 bits, which hold `a` (high) and `b` (low).
            let ab = unsafe { _mm_extract_epi64(prepared[0], 1) };
            let ab_b = ab as u32;
            let ab_a = (ab >> 32) as u32;

            assert_eq!(a, reference_state[0], "lane {}", lane);
            assert_eq!(b, reference_state[1], "lane {}", lane);
            assert_eq!(ab_a, reference_state[0], "a={} b={}", a, b);
            assert_eq!(ab_b, reference_state[1], "a={} b={}", a, b);
        }
    }
}