#![allow(clippy::cast_possible_truncation, clippy::indexing_slicing)]
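// BLAKE2b compression for AArch64 using NEON intrinsics. The 4x4 matrix of
// 64-bit state words is processed as four rows (a, b, c, d), each held in a
// pair of uint64x2_t registers (two words per register).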
use core::arch::aarch64::*;
use super::kernels::init_v;
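/// Rotates each 64-bit lane right by 32 bits by swapping its 32-bit halves.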
#[inline(always)]
unsafe fn ror32(x: uint64x2_t) -> uint64x2_t {
    unsafe { vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_u64(x))) }
}
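/// Rotates each 64-bit lane right by 24 bits with a TBL byte shuffle:
/// output byte `i` of each lane comes from input byte `(i + 3) % 8`.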
#[inline(always)]
unsafe fn ror24(x: uint64x2_t) -> uint64x2_t {
    static ROT24: [u8; 16] = [3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10];
    unsafe {
        let tbl = vld1q_u8(ROT24.as_ptr());
        vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(x), tbl))
    }
}
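/// Rotates each 64-bit lane right by 16 bits with a TBL byte shuffle:
/// output byte `i` of each lane comes from input byte `(i + 2) % 8`.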
#[inline(always)]
unsafe fn ror16(x: uint64x2_t) -> uint64x2_t {
    static ROT16: [u8; 16] = [2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9];
    unsafe {
        let tbl = vld1q_u8(ROT16.as_ptr());
        vreinterpretq_u64_u8(vqtbl1q_u8(vreinterpretq_u8_u64(x), tbl))
    }
}
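/// Rotates each 64-bit lane right by 63 bits (left by 1): `vaddq_u64(x, x)`
/// computes `x << 1`, and the shift-right-insert merges in `x >> 63`.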
#[inline(always)]
unsafe fn ror63(x: uint64x2_t) -> uint64x2_t {
    unsafe { vsriq_n_u64(vaddq_u64(x, x), x, 63) }
}
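/// Applies the BLAKE2b G function to all four columns (or, after
/// `diagonalize`, all four diagonals) in parallel: `a += b + mx`,
/// `d = (d ^ a) >>> 32`, `c += d`, `b = (b ^ c) >>> 24`, then the same
/// shape again with `my` and rotations by 16 and 63.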
#[inline(always)]
#[allow(clippy::too_many_arguments)]
unsafe fn g2(
    a0: &mut uint64x2_t,
    a1: &mut uint64x2_t,
    b0: &mut uint64x2_t,
    b1: &mut uint64x2_t,
    c0: &mut uint64x2_t,
    c1: &mut uint64x2_t,
    d0: &mut uint64x2_t,
    d1: &mut uint64x2_t,
    mx0: uint64x2_t,
    mx1: uint64x2_t,
    my0: uint64x2_t,
    my1: uint64x2_t,
) {
    unsafe {
        *a0 = vaddq_u64(vaddq_u64(*a0, *b0), mx0);
        *a1 = vaddq_u64(vaddq_u64(*a1, *b1), mx1);
        *d0 = ror32(veorq_u64(*d0, *a0));
        *d1 = ror32(veorq_u64(*d1, *a1));
        *c0 = vaddq_u64(*c0, *d0);
        *c1 = vaddq_u64(*c1, *d1);
        *b0 = ror24(veorq_u64(*b0, *c0));
        *b1 = ror24(veorq_u64(*b1, *c1));
        *a0 = vaddq_u64(vaddq_u64(*a0, *b0), my0);
        *a1 = vaddq_u64(vaddq_u64(*a1, *b1), my1);
        *d0 = ror16(veorq_u64(*d0, *a0));
        *d1 = ror16(veorq_u64(*d1, *a1));
        *c0 = vaddq_u64(*c0, *d0);
        *c1 = vaddq_u64(*c1, *d1);
        *b0 = ror63(veorq_u64(*b0, *c0));
        *b1 = ror63(veorq_u64(*b1, *c1));
    }
}
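/// Rotates rows b, c, d left by 1, 2, and 3 word positions respectively, so
/// the next `g2` call operates on the state's diagonals.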
#[inline(always)]
unsafe fn diagonalize(
    b0: &mut uint64x2_t,
    b1: &mut uint64x2_t,
    c0: &mut uint64x2_t,
    c1: &mut uint64x2_t,
    d0: &mut uint64x2_t,
    d1: &mut uint64x2_t,
) {
    unsafe {
        let tb0 = *b0;
        let tb1 = *b1;
        *b0 = vextq_u64(tb0, tb1, 1);
        *b1 = vextq_u64(tb1, tb0, 1);
        core::mem::swap(c0, c1);
        let td0 = *d0;
        let td1 = *d1;
        *d0 = vextq_u64(td1, td0, 1);
        *d1 = vextq_u64(td0, td1, 1);
    }
}
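/// Inverse of `diagonalize`: rotates rows b, c, d back into column order.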
#[inline(always)]
unsafe fn undiagonalize(
    b0: &mut uint64x2_t,
    b1: &mut uint64x2_t,
    c0: &mut uint64x2_t,
    c1: &mut uint64x2_t,
    d0: &mut uint64x2_t,
    d1: &mut uint64x2_t,
) {
    unsafe {
        let tb0 = *b0;
        let tb1 = *b1;
        *b0 = vextq_u64(tb1, tb0, 1);
        *b1 = vextq_u64(tb0, tb1, 1);
        core::mem::swap(c0, c1);
        let td0 = *d0;
        let td1 = *d1;
        *d0 = vextq_u64(td0, td1, 1);
        *d1 = vextq_u64(td1, td0, 1);
    }
}
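/// Loads 16 bytes from `block` at byte `offset` as two 64-bit message words.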
#[inline(always)]
unsafe fn load_u64x2(block: &[u8; 128], offset: usize) -> uint64x2_t {
    unsafe { vreinterpretq_u64_u8(vld1q_u8(block.as_ptr().add(offset))) }
}
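/// Packs the low lanes of `a` and `b` into one vector: `[a[0], b[0]]`.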
#[inline(always)]
unsafe fn lo2(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
    unsafe { vcombine_u64(vget_low_u64(a), vget_low_u64(b)) }
}
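/// Packs the high lanes of `a` and `b` into one vector: `[a[1], b[1]]`.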
#[inline(always)]
unsafe fn hi2(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
    unsafe { vcombine_u64(vget_high_u64(a), vget_high_u64(b)) }
}
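/// Packs the low lane of `a` with the high lane of `b`: `[a[0], b[1]]`.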
#[inline(always)]
unsafe fn lo_hi(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
    unsafe { vcombine_u64(vget_low_u64(a), vget_high_u64(b)) }
}
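/// Extracts one lane across the pair: `[a[1], b[0]]`.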
#[inline(always)]
unsafe fn ext1(a: uint64x2_t, b: uint64x2_t) -> uint64x2_t {
    unsafe { vextq_u64(a, b, 1) }
}
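/// One full BLAKE2b round: a column step fed by `$mx*`/`$my*`, then a
/// diagonal step fed by `$nx*`/`$ny*`, with the state diagonalized in
/// between and restored afterwards.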
macro_rules! blake2b_round {
    (
        $a0:ident, $a1:ident, $b0:ident, $b1:ident, $c0:ident, $c1:ident, $d0:ident, $d1:ident;
        $mx0:expr, $mx1:expr; $my0:expr, $my1:expr;
        $nx0:expr, $nx1:expr; $ny0:expr, $ny1:expr
    ) => {{
        let mx0 = $mx0;
        let mx1 = $mx1;
        let my0 = $my0;
        let my1 = $my1;
        g2(
            &mut $a0, &mut $a1, &mut $b0, &mut $b1, &mut $c0, &mut $c1, &mut $d0, &mut $d1,
            mx0, mx1, my0, my1,
        );
        diagonalize(&mut $b0, &mut $b1, &mut $c0, &mut $c1, &mut $d0, &mut $d1);
        let mx0 = $nx0;
        let mx1 = $nx1;
        let my0 = $ny0;
        let my1 = $ny1;
        g2(
            &mut $a0, &mut $a1, &mut $b0, &mut $b1, &mut $c0, &mut $c1, &mut $d0, &mut $d1,
            mx0, mx1, my0, my1,
        );
        undiagonalize(&mut $b0, &mut $b1, &mut $c0, &mut $c1, &mut $d0, &mut $d1);
    }};
}
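/// BLAKE2b compression: mixes one 128-byte `block` into the state `h`,
/// where `t` is the message counter and `last` flags the final block.
/// Twelve rounds are applied (rounds 10 and 11 reuse the message schedules
/// of rounds 0 and 1), followed by the feed-forward `h[i] ^= v[i] ^ v[i + 8]`.
///
/// # Safety
///
/// Callers must ensure the `neon` target feature is available on the
/// running CPU.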
#[target_feature(enable = "neon")]
pub(super) unsafe fn compress_neon(h: &mut [u64; 8], block: &[u8; 128], t: u128, last: bool) {
    unsafe {
        // Build the 16-word working vector from the state `h`, the counter
        // `t`, and the `last` flag.
        let v = init_v(h, t, last);
        // The sixteen message words, two per register.
        let m0 = load_u64x2(block, 0);
        let m1 = load_u64x2(block, 16);
        let m2 = load_u64x2(block, 32);
        let m3 = load_u64x2(block, 48);
        let m4 = load_u64x2(block, 64);
        let m5 = load_u64x2(block, 80);
        let m6 = load_u64x2(block, 96);
        let m7 = load_u64x2(block, 112);
        // The four rows of the state matrix, two words per register.
        let mut a0 = vld1q_u64(v.as_ptr());
        let mut a1 = vld1q_u64(v.as_ptr().add(2));
        let mut b0 = vld1q_u64(v.as_ptr().add(4));
        let mut b1 = vld1q_u64(v.as_ptr().add(6));
        let mut c0 = vld1q_u64(v.as_ptr().add(8));
        let mut c1 = vld1q_u64(v.as_ptr().add(10));
        let mut d0 = vld1q_u64(v.as_ptr().add(12));
        let mut d1 = vld1q_u64(v.as_ptr().add(14));
        // Rounds 0..=9: each operand gathers the message words selected by
        // the corresponding row of the BLAKE2b sigma schedule.
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo2(m0, m1), lo2(m2, m3); hi2(m0, m1), hi2(m2, m3);
            lo2(m4, m5), lo2(m6, m7); hi2(m4, m5), hi2(m6, m7)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo2(m7, m2), hi2(m4, m6); lo2(m5, m4), ext1(m7, m3);
            ext1(m0, m0), hi2(m5, m2); lo2(m6, m1), hi2(m3, m1)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            ext1(m5, m6), hi2(m2, m7); lo2(m4, m0), lo_hi(m1, m6);
            lo_hi(m5, m1), hi2(m3, m4); lo2(m7, m3), ext1(m0, m2)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            hi2(m3, m1), hi2(m6, m5); hi2(m4, m0), lo2(m6, m7);
            lo_hi(m1, m2), lo_hi(m2, m7); lo2(m3, m5), lo2(m0, m4)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            hi2(m4, m2), lo2(m1, m5); lo_hi(m0, m3), lo_hi(m2, m7);
            lo_hi(m7, m5), lo_hi(m3, m1); ext1(m0, m6), lo_hi(m4, m6)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo2(m1, m3), lo2(m0, m4); lo2(m6, m5), hi2(m5, m1);
            lo_hi(m2, m3), hi2(m7, m0); hi2(m6, m2), lo_hi(m7, m4)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo_hi(m6, m0), lo2(m7, m2); hi2(m2, m7), ext1(m6, m5);
            lo2(m0, m3), ext1(m4, m4); hi2(m3, m1), lo_hi(m1, m5)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            hi2(m6, m3), lo_hi(m6, m1); ext1(m5, m7), hi2(m0, m4);
            hi2(m2, m7), lo2(m4, m1); lo2(m0, m2), lo2(m3, m5)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo2(m3, m7), ext1(m5, m0); hi2(m7, m4), ext1(m1, m4);
            m6, ext1(m0, m5); lo_hi(m1, m3), m2
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo2(m5, m4), hi2(m3, m0); lo2(m1, m2), lo_hi(m3, m2);
            hi2(m7, m4), hi2(m1, m6); ext1(m5, m7), lo2(m6, m0)
        );
        // Rounds 10 and 11 repeat schedule rows 0 and 1.
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo2(m0, m1), lo2(m2, m3); hi2(m0, m1), hi2(m2, m3);
            lo2(m4, m5), lo2(m6, m7); hi2(m4, m5), hi2(m6, m7)
        );
        blake2b_round!(
            a0, a1, b0, b1, c0, c1, d0, d1;
            lo2(m7, m2), hi2(m4, m6); lo2(m5, m4), ext1(m7, m3);
            ext1(m0, m0), hi2(m5, m2); lo2(m6, m1), hi2(m3, m1)
        );
        // Feed-forward: h[i] ^= v[i] ^ v[i + 8].
        let h0 = vld1q_u64(h.as_ptr());
        let h1 = vld1q_u64(h.as_ptr().add(2));
        let h2 = vld1q_u64(h.as_ptr().add(4));
        let h3 = vld1q_u64(h.as_ptr().add(6));
        vst1q_u64(h.as_mut_ptr(), veorq_u64(h0, veorq_u64(a0, c0)));
        vst1q_u64(h.as_mut_ptr().add(2), veorq_u64(h1, veorq_u64(a1, c1)));
        vst1q_u64(h.as_mut_ptr().add(4), veorq_u64(h2, veorq_u64(b0, d0)));
        vst1q_u64(h.as_mut_ptr().add(6), veorq_u64(h3, veorq_u64(b1, d1)));
    }
}
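
// A minimal sanity sketch: each vectorized rotation should agree with the
// scalar `u64::rotate_right` on both lanes. This only compiles where the
// module itself does (aarch64 with NEON intrinsics available).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn vector_rotations_match_scalar() {
        let xs: [u64; 2] = [0x0123_4567_89ab_cdef, 0xfedc_ba98_7654_3210];
        let cases: [(u32, unsafe fn(uint64x2_t) -> uint64x2_t); 4] =
            [(32, ror32), (24, ror24), (16, ror16), (63, ror63)];
        for (rot, f) in cases {
            let mut out = [0u64; 2];
            // SAFETY: NEON is always available on aarch64.
            unsafe {
                vst1q_u64(out.as_mut_ptr(), f(vld1q_u64(xs.as_ptr())));
            }
            assert_eq!(out[0], xs[0].rotate_right(rot));
            assert_eq!(out[1], xs[1].rotate_right(rot));
        }
    }
}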